From d126a0d4fd6f0b6da928135238cd2cc8f2072380 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 22 Sep 2020 01:45:44 -0700 Subject: [PATCH 001/449] [iOS] Disable the iOS nightly build until the cert issue has resolved (#45094) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45094 Test Plan: Imported from OSS Reviewed By: husthyc Differential Revision: D23831152 Pulled By: xta0 fbshipit-source-id: 6327edba01e4d5abad63ac35680eefb22276423f --- .circleci/cimodel/data/simple/nightly_ios.py | 2 +- .circleci/config.yml | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.circleci/cimodel/data/simple/nightly_ios.py b/.circleci/cimodel/data/simple/nightly_ios.py index 580dfa3d7ae8..6c01479dde80 100644 --- a/.circleci/cimodel/data/simple/nightly_ios.py +++ b/.circleci/cimodel/data/simple/nightly_ios.py @@ -60,7 +60,7 @@ def gen_tree(self): WORKFLOW_DATA = BUILD_CONFIGS + [ - IOSNightlyJob("binary", is_upload=True), + # IOSNightlyJob("binary", is_upload=True), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index b32bb9b5086a..5ca2d725b9e9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7021,15 +7021,6 @@ workflows: ios_arch: arm64 ios_platform: OS name: pytorch_ios_11_2_1_nightly_arm64_build - - binary_ios_upload: - build_environment: libtorch-ios-11.2.1-nightly-binary-build-upload - context: org-member - filters: - branches: - only: nightly - requires: - - pytorch_ios_11_2_1_nightly_x86_64_build - - pytorch_ios_11_2_1_nightly_arm64_build - pytorch_linux_build: build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32 docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c From c947ab0bb977dd995f0a96099b69fbab28377001 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Tue, 22 Sep 2020 02:02:29 -0700 Subject: [PATCH 002/449] Added sparse support for asin and neg functions, updated log1p (#44028) Summary: Description: - [x] added C++ code for sparse `asin` and `neg` ops similarly to `log1p` op - [x] added tests - [x] coalesced input CPU/CUDA - [x] uncoalesced input CPU/CUDA - [x] added tests for `negative` and `arcsin` Backprop will be addressed in another PR. 
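For reference, a minimal usage sketch of the behavior described above (illustration only, not part of the patch; the tensor values are made up, and it relies on the standard `torch.sparse_coo_tensor` constructor plus the sparse dispatch entries added below):

```python
import torch

# Uncoalesced input: index 0 appears twice, so coalescing sums its values (0.25 + 0.25 = 0.5).
i = torch.tensor([[0, 1, 0]])
v = torch.tensor([0.25, 0.50, 0.25])
s = torch.sparse_coo_tensor(i, v, (3,))

s.neg().to_dense()    # tensor([-0.5000, -0.5000,  0.0000])
s.asin().to_dense()   # ~tensor([0.5236, 0.5236, 0.0000]); the input is coalesced internally

# neg_ is allowed even on uncoalesced input, since negation distributes over
# the duplicate-index summation performed by coalesce().
s.clone().neg_()

# asin_ (like log1p_) is rejected on uncoalesced input because
# asin(v1 + v2) != asin(v1) + asin(v2); coalesce first.
s.coalesce().asin_()
```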
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44028 Reviewed By: agolynski Differential Revision: D23793027 Pulled By: mruberry fbshipit-source-id: 5fd642808da8e528cf6acd608ca0dcd720c4ccc3 --- aten/src/ATen/native/UnaryOps.cpp | 6 +- aten/src/ATen/native/native_functions.yaml | 12 + .../ATen/native/sparse/SparseTensorMath.cpp | 56 ++++- test/test_sparse.py | 206 ++++++++++++++---- 4 files changed, 233 insertions(+), 47 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 2764490f6d48..b5a6e2c017e7 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -405,9 +405,9 @@ Tensor& neg_out(Tensor& result, const Tensor& self) { Tensor neg(const Tensor& self) { return unary_op_impl(self, at::neg_out); } Tensor& neg_(Tensor& self) { return unary_op_impl_(self, at::neg_out); } -Tensor& negative_out(Tensor& result, const Tensor& self) { return at::native::neg_out(result, self); } -Tensor negative(const Tensor& self) { return at::native::neg(self); } -Tensor& negative_(Tensor& self) { return at::native::neg_(self); } +Tensor& negative_out(Tensor& result, const Tensor& self) { return at::neg_out(result, self); } +Tensor negative(const Tensor& self) { return self.neg(); } +Tensor& negative_(Tensor& self) { return self.neg_(); } Tensor logical_not(const Tensor& self) { Tensor result = at::empty({0}, self.options().dtype(kBool)); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6541e45b3230..a84c9b4b61b8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -553,8 +553,14 @@ - func: asin_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: asin_ + SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asin_out + SparseCPU, SparseCUDA: asin_out_sparse # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor @@ -2716,8 +2722,14 @@ - func: neg_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: neg_ + SparseCPU, SparseCUDA: neg_sparse_ - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: neg_out + SparseCPU, SparseCUDA: neg_out_sparse # Alias for neg - func: negative(Tensor self) -> Tensor diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9eee5e056dff..2bb5842b4726 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -95,16 +95,17 @@ SparseTensor& mul_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scal // log1p(SparseTensor) // -------------------------------------------------------------------- -// TODO: add in-place variant +// In-place log1p on uncoalesced tensors is not supported since the operation is not a linear map. 
+// Values of uncoalesced tensor corresponding to the same indices are summed +// and log1p(summed_value) != log1p(v1) + log1p(v2) SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { - AT_ASSERT(r.is_sparse()); - AT_ASSERT(t.is_sparse()); + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place - TORCH_CHECK( - r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + TORCH_CHECK(r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported"); } else { copy_sparse_to_sparse_(r, t.coalesce()); @@ -114,10 +115,53 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { } SparseTensor& log1p_sparse_(SparseTensor& t) { - TORCH_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); return log1p_out_sparse(t, t); } +// -------------------------------------------------------------------- +// neg(SparseTensor) +// -------------------------------------------------------------------- + +SparseTensor& neg_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + // copy_sparse_ does not perform the copy if it is the same tensor + copy_sparse_to_sparse_(r, t); + r._values().neg_(); + return r; +} + +SparseTensor& neg_sparse_(SparseTensor& t) { + return neg_out_sparse(t, t); +} + +// -------------------------------------------------------------------- +// asin(SparseTensor) +// -------------------------------------------------------------------- + +// In-place asin on uncoalesced tensors is not supported since the operation is not a linear map. 
+// Values of uncoalesced tensor corresponding to the same indices are summed +// and asin(summed_value) != asin(v1) + asin(v2) + +SparseTensor& asin_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + if (is_same_tensor(r, t)) { + // don't have in-place asin for uncoalesced input because coalesce() is not in-place, see above comment + TORCH_CHECK(r.is_coalesced(), "asin: in-place on uncoalesced tensors is not supported"); + } else { + copy_sparse_to_sparse_(r, t.coalesce()); + } + r._values().asin_(); + return r; +} + +SparseTensor& asin_sparse_(SparseTensor& t) { + return asin_out_sparse(t, t); +} + // -------------------------------------------------------------------- // pow(SparseTensor, Scalar) // -------------------------------------------------------------------- diff --git a/test/test_sparse.py b/test/test_sparse.py index 64846ab729eb..af833be6810c 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -6,6 +6,7 @@ import itertools import functools +import operator import random import unittest from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ @@ -1728,53 +1729,182 @@ def test_narrow(self): self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim - def _test_log1p_tensor(self, input, dense_tensor): + def _test_log1p_tensor(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - # test in-place op on uncoalesced input - with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): - input.log1p_() + self.assertEqual(expected_output, sparse_tensor.log1p().to_dense()) + self.assertEqual(expected_output, sparse_tensor.coalesce().log1p_().to_dense()) - input.requires_grad_() - self.assertTrue(input.requires_grad) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + sparse_tensor.log1p_() + + sparse_tensor.requires_grad_() + self.assertTrue(sparse_tensor.requires_grad) # test autograd - x = input.clone() - y = input.log1p() + x = sparse_tensor.clone() + y = sparse_tensor.log1p() with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): y.backward(x) def test_log1p(self): - input = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([3, 4, 5]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - # test uncoalesced input - input_uncoalesced = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input_uncoalesced, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - input = torch.sparse_coo_tensor( - torch.zeros([2, 0]), - torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), - torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) - - input = torch.sparse_coo_tensor( - torch.zeros([1, 5]), - torch.zeros([5, 6, 0]), - torch.Size([5, 
6, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2]]).transpose(1, 0), + values=torch.tensor([3.0, 4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[1.0, 3.0], [5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, 3.0, 4.0, 1.0, 1.0, 1.0]), + size=[3, ], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + def _test_neg_negative(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.neg() + + ops = ( + torch.neg, torch.Tensor.neg, torch.Tensor.neg_, + torch.negative, torch.Tensor.negative, torch.Tensor.negative_, + operator.neg + ) + for op in ops: + sparse_tensor_copy = sparse_tensor.clone() + self.assertEqual(expected_output, op(sparse_tensor_copy).to_dense()) + + if op in (torch.neg, torch.negative): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + def test_neg_negative(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2]]), + values=torch.tensor([3.0, -4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-1.0, 3.0], [-5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, -3.0, -4.0, 1.0, -1.0, 1.5]), + size=[3, ], + device=self.device + ) + self._test_neg_negative(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_neg_negative(input_uncoalesced) + + def _test_asin_arcsin(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.asin() + + ops = ( + torch.asin, torch.Tensor.asin, + torch.arcsin, torch.Tensor.arcsin, + ) + for op in ops: + self.assertEqual(expected_output, op(sparse_tensor).to_dense()) + if op in (torch.asin, torch.arcsin): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + for op in (torch.Tensor.asin_, torch.Tensor.arcsin_): + self.assertEqual(expected_output, 
op(sparse_tensor.clone().coalesce()).to_dense()) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + op(sparse_tensor) + + def test_asin_arcsin(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2, 3]]), + values=torch.tensor([0.5, -0.5, 0.7, -0.7]), + size=[4, ], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-0.1, 0.24], [-0.44, 0.1]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([0.3, -0.3, -0.4, 0.3, -0.5, 0.15]), + size=[3, ], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) def test_mv(self): def test_shape(di, dj, dk, nnz): From 339961187a9750e8d5f10954ce78ea8cf819987c Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Tue, 22 Sep 2020 03:18:30 -0700 Subject: [PATCH 003/449] [pytorch] refine dispatch keys in native_functions.yaml (1/N) (#45010) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45010 The motivation of this change is to differentiate "backend specific" ops and "generic" ops. "backend specific" ops are those invoking backend specific kernels thus only able to run on certain backends, e.g.: CPU, CUDA. "generic" ops are those not *directly* invoking backend specific kernels. They are usually calling other "backend specific" ops to get things done. Thus, they are also referred to as "composite" ops, or "math" ops (because they are usually pure C++ code constructed from math formula). The other way to see the difference is that: we have to implement new kernels for the "backend specific" ops if we want to run these ops on a new backend. In contrast, "generic"/"composite" ops can run on the new backend if we've added support for all the "backend specific" ops to which they delegate their work. Historically we didn't make a deliberate effort to always populate supported backends to the "dispatch" section for all the "backend specific" ops in native_functions.yaml. So now there are many ops which don't have "dispatch" section but are actually "backend specific" ops. Majority of them are calling "DispatchStub" kernels, which usually only support CPU/CUDA (via TensorIterator) or QuantizedCPU/CUDA. The ultimate goal is to be able to differentiate these two types of ops by looking at the "dispatch" section in native_functions.yaml. This PR leveraged the analysis script on #44963 to populate missing dispatch keys for a set of "backend specific" ops. As the initial step, we only deal with the simplest case: * These ops don't already have dispatch section in native_functions.yaml; * These ops call one or more DispatchStub (thus "backend specific"); * These ops don't call any other aten ops - except for some common ones almost every op calls via framework, e.g. 
calling aten::eq via Dispatcher::checkSchemaCompatibility. Calling other nontrivial aten ops is a sign of being "composite", so we don't want to deal with this case now; * These ops don't call Tensor::is_quantized() / Tensor::is_sparse() / etc. Some ops call thse Tensor::is_XXX() methods to dispatch to quantized / sparse kernels internally. We don't deal with this case now. Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23803951 Pulled By: ljk53 fbshipit-source-id: aaced7c34427d1ede72380af4513508df366ea16 --- aten/src/ATen/native/native_functions.yaml | 142 ++++++++++++++++++++- 1 file changed, 137 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a84c9b4b61b8..8aac7483ff2a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -290,6 +290,8 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -304,6 +306,8 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -480,6 +484,8 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -501,6 +507,8 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -522,6 +530,8 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -582,6 +592,8 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -673,6 +685,8 @@ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -917,12 +931,16 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -996,6 +1014,8 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1006,6 +1026,8 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1189,7 +1211,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1455,6 +1477,8 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1465,6 +1489,8 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1475,6 +1501,8 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1485,6 +1513,8 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1599,6 +1629,8 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1617,6 +1649,8 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1627,6 +1661,8 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1994,12 +2030,16 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2714,6 +2754,8 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2741,6 +2783,8 @@ variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: negative_out - func: repeat(Tensor self, int[] repeats) -> Tensor use_c10_dispatcher: full @@ -2900,6 +2944,8 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2921,6 +2967,8 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2935,6 +2983,8 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2957,6 +3007,8 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3167,6 +3219,8 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3236,6 +3290,8 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3249,6 +3305,8 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3596,8 +3654,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -4799,6 +4857,8 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4817,6 +4877,8 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4906,27 +4968,41 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) 
self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -5380,6 +5456,8 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5390,6 +5468,8 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5609,12 +5689,16 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5647,6 +5731,8 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5674,6 +5760,8 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -5740,19 +5828,27 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -6477,10 +6573,14 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6490,6 +6590,8 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6523,6 +6625,8 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6534,6 +6638,8 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6562,6 +6668,8 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6572,14 +6680,20 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6603,6 +6717,8 @@ - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6668,10 +6784,14 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6681,13 +6801,19 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6697,6 +6823,8 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7468,6 +7596,8 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7477,6 +7607,8 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? 
# From 71aeb84ab491ae005b6f4caf84b3892edaf968ad Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Tue, 22 Sep 2020 05:40:03 -0700 Subject: [PATCH 004/449] Revert D23803951: [pytorch] refine dispatch keys in native_functions.yaml (1/N) Test Plan: revert-hammer Differential Revision: D23803951 (https://github.com/pytorch/pytorch/commit/339961187a9750e8d5f10954ce78ea8cf819987c) Original commit changeset: aaced7c34427 fbshipit-source-id: fcc4fb6a2c1d79b587f62347b43f8851fe1647fd --- aten/src/ATen/native/native_functions.yaml | 142 +-------------------- 1 file changed, 5 insertions(+), 137 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8aac7483ff2a..a84c9b4b61b8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -290,8 +290,6 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -306,8 +304,6 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -484,8 +480,6 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -507,8 +501,6 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -530,8 +522,6 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -592,8 +582,6 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -685,8 +673,6 @@ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -931,16 +917,12 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -1014,8 +996,6 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
- dispatch: - CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1026,8 +1006,6 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1211,7 +1189,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1477,8 +1455,6 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1489,8 +1465,6 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1501,8 +1475,6 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1513,8 +1485,6 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1629,8 +1599,6 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1649,8 +1617,6 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1661,8 +1627,6 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2030,16 +1994,12 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2754,8 +2714,6 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2783,8 +2741,6 @@ variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: negative_out - func: repeat(Tensor self, int[] repeats) -> Tensor use_c10_dispatcher: full @@ -2944,8 +2900,6 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn - dispatch: - CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2967,8 +2921,6 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2983,8 +2935,6 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3007,8 +2957,6 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3219,8 +3167,6 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3290,8 +3236,6 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3305,8 +3249,6 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3654,8 +3596,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -4857,8 +4799,6 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method - dispatch: - CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4877,8 +4817,6 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method - dispatch: - CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4968,41 +4906,27 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method - dispatch: - CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
variants: method - dispatch: - CPU, CUDA: geometric_ # wrappers for TH functions @@ -5456,8 +5380,6 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5468,8 +5390,6 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5689,16 +5609,12 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5731,8 +5647,6 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5760,8 +5674,6 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -5828,27 +5740,19 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - dispatch: - CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - dispatch: - CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -6573,14 +6477,10 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6590,8 +6490,6 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6625,8 +6523,6 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6638,8 +6534,6 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) 
use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6668,8 +6562,6 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6680,20 +6572,14 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6717,8 +6603,6 @@ - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6784,14 +6668,10 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6801,19 +6681,13 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn - dispatch: - CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6823,8 +6697,6 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7596,8 +7468,6 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7607,8 +7477,6 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn - dispatch: - CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? 
# From 1b059f2c6de83a9c2445ab3fb21d10893f66a839 Mon Sep 17 00:00:00 2001 From: Bugra Akyildiz Date: Tue, 22 Sep 2020 06:27:06 -0700 Subject: [PATCH 005/449] Directly use work.result() to retrieve tensor rather than passing as a separate argument (#44914) Summary: We currently are fetching an allreduced tensor from Python in C++ in, where we are storing the resulting tensor in a struct's parameter. This PR removes extra tensor paratemeter in the function parameter and fetch from a single place. Fixes https://github.com/pytorch/pytorch/issues/43960 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44914 Reviewed By: rohan-varma Differential Revision: D23798888 Pulled By: bugra fbshipit-source-id: ad1b8c31c15e3758a57b17218bbb9dc1f61f1577 --- torch/csrc/distributed/c10d/reducer.cpp | 15 +++++++-------- torch/csrc/distributed/c10d/reducer.h | 1 - torch/nn/parallel/distributed.py | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index ac4e735af94a..a895bea5fc26 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -425,11 +425,9 @@ std::vector> Reducer::get_bucket_tensors() const { void Reducer::set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize) { std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); - forwardPassWorkHandle_.resultTensor = tensor; forwardPassWorkHandle_.useStaticWorldSize = useStaticWorldSize; } @@ -573,12 +571,13 @@ void Reducer::mark_variable_ready(VariableIndex index) { if (divFactor_ == kUnsetDivFactor) { divFactor_ = process_group_->getSize(); auto& workHandle = forwardPassWorkHandle_.workHandle; - if (workHandle) { - if (!forwardPassWorkHandle_.useStaticWorldSize) { - workHandle->wait(); - at::Tensor& res = forwardPassWorkHandle_.resultTensor; - divFactor_ = res.item().to(); - } + if (workHandle && !forwardPassWorkHandle_.useStaticWorldSize) { + workHandle->wait(); + auto results = workHandle->result(); + // Guard against the results being empty + TORCH_INTERNAL_ASSERT(results.size() > 0); + at::Tensor& res = results.front(); + divFactor_ = res.item().to(); } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 87ad60330af7..d45e5c2b90e1 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -89,7 +89,6 @@ class Reducer { // corresponding tensor being reduced. void set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize); // Retrieve on-device tensors used to track locally unused parameters. 
For diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 1425f73dd365..44f5e6fe2ccb 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -581,7 +581,7 @@ def forward(self, *inputs, **kwargs): ) work = dist.all_reduce(ones, group=self.process_group, async_op=True) self.reducer._set_forward_pass_work_handle( - work, ones, self.ddp_join_divide_by_initial_world_size + work, self.ddp_join_divide_by_initial_world_size ) # Calling _rebuild_buckets before forward compuation, From 58b6ab69e5100adc01dd1bf272e70279ba6ae012 Mon Sep 17 00:00:00 2001 From: anjali411 Date: Tue, 22 Sep 2020 08:01:16 -0700 Subject: [PATCH 006/449] torch.sgn for complex tensors (#39955) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/39955 resolves https://github.com/pytorch/pytorch/issues/36323 by adding `torch.sgn` for complex tensors. `torch.sgn` returns `x/abs(x)` for `x != 0` and returns `0 + 0j` for `x==0` This PR doesn't test the correctness of the gradients. It will be done as a part of auditing all the ops in future once we decide the autograd behavior (JAX vs TF) and add gradchek. Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D23460526 Pulled By: anjali411 fbshipit-source-id: 70fc4e14e4d66196e27cf188e0422a335fc42f92 --- aten/src/ATen/core/aten_interned_strings.h | 1 + aten/src/ATen/cpu/vec256/vec256_base.h | 7 +++++ .../ATen/cpu/vec256/vec256_complex_double.h | 10 +++++++ .../ATen/cpu/vec256/vec256_complex_float.h | 10 +++++++ aten/src/ATen/native/UnaryOps.cpp | 12 ++++++++ aten/src/ATen/native/UnaryOps.h | 1 + aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 26 ++++++++++++----- aten/src/ATen/native/cpu/zmath.h | 9 ++++++ aten/src/ATen/native/cuda/UnarySignKernels.cu | 17 +++++++++++ aten/src/ATen/native/native_functions.yaml | 9 ++++++ .../operator_benchmark/pt/unary_test.py | 1 + docs/source/name_inference.rst | 2 ++ docs/source/tensors.rst | 2 ++ test/test_autograd.py | 2 +- test/test_torch.py | 15 ++++++++++ tools/autograd/derivatives.yaml | 11 ++++--- torch/_tensor_docs.py | 14 +++++++++ torch/_torch_docs.py | 25 ++++++++++++++++ torch/csrc/autograd/FunctionsManual.cpp | 29 +++++++++++++++++++ torch/csrc/autograd/FunctionsManual.h | 3 ++ torch/overrides.py | 1 + .../_internal/common_methods_invocations.py | 2 ++ 22 files changed, 196 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index c5e4b0ea3c01..4fa49302240b 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -611,6 +611,7 @@ _(aten, sigmoid) \ _(aten, sign) \ _(aten, signbit) \ _(aten, silu) \ +_(aten, sgn) \ _(aten, sin) \ _(aten, sinh) \ _(aten, size) \ diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 0e66cb357965..49acbc518dca 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -239,6 +239,13 @@ struct Vec256 { // Specifically map() does not perform the type conversion needed by abs. 
return map([](T x) { return static_cast(std::abs(x)); }); } + + template ::value, int>::type = 0> + Vec256 sgn() const { + return map(at::native::sgn_impl); + } + template ::value, int>::type = 0> Vec256 angle() const { diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index fbc7a480a4c0..0827b33a3122 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -134,6 +134,16 @@ template <> class Vec256> { auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle return _mm256_and_pd(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return blendv(div, zero, mask); + } __m256d real_() const { const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index 892345e9d5c5..ea931acc494b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -171,6 +171,16 @@ template <> class Vec256> { auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm256_and_ps(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return _mm256_blendv_ps(div, zero, mask); + } __m256 real_() const { const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index b5a6e2c017e7..f9af400ba2f4 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -301,6 +301,17 @@ Tensor& sign_out(Tensor& result, const Tensor& self) { return unary_op_impl_out( Tensor sign(const Tensor& self) { return unary_op_impl(self, at::sign_out); } Tensor& sign_(Tensor& self) { return unary_op_impl_(self, at::sign_out); } +Tensor& sgn_out(Tensor& result, const Tensor& self) { + if (self.is_complex()) { + return unary_op_impl_out(result, self, sgn_stub); + } else { + return unary_op_impl_out(result, self, sign_stub); + } +} + +Tensor sgn(const Tensor& self) { return unary_op_impl(self, at::sgn_out); } +Tensor& sgn_(Tensor& self) { return unary_op_impl_(self, at::sgn_out); } + Tensor& sin_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sin_stub); } Tensor sin(const Tensor& self) { return unary_op_impl(self, at::sin_out); } Tensor& sin_(Tensor& self) { return unary_op_impl_(self, at::sin_out); } @@ -639,6 +650,7 @@ DEFINE_DISPATCH(sigmoid_stub); DEFINE_DISPATCH(logit_stub); DEFINE_DISPATCH(sign_stub); DEFINE_DISPATCH(signbit_stub); +DEFINE_DISPATCH(sgn_stub); DEFINE_DISPATCH(sin_stub); DEFINE_DISPATCH(sinh_stub); DEFINE_DISPATCH(sqrt_stub); diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index fa172cb58b38..0dcd5a0b9473 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -53,6 +53,7 @@ DECLARE_DISPATCH(unary_fn, sigmoid_stub); 
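Worth noting in `sgn_out` above: non-complex inputs are routed to the existing `sign_stub`, so for real dtypes `torch.sgn` should coincide with `torch.sign`; only complex inputs take the new kernel. A small illustrative check, not a test from this patch:

```
import torch

x = torch.tensor([-2.0, 0.0, 5.0])
print(torch.sign(x))  # tensor([-1., 0., 1.])
# After this patch, torch.sgn(x) hits the same sign kernel for real dtypes
# and should print the same values.
```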
DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub); DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); +DECLARE_DISPATCH(unary_fn, sgn_stub); DECLARE_DISPATCH(unary_fn, sin_stub); DECLARE_DISPATCH(unary_fn, sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index adf300522692..45c7e4e23762 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -270,16 +270,16 @@ static void sign_kernel(TensorIterator& iter){ auto one_vec = Vec256(static_cast(1)); cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, - [=](Vec256 self_vec){ + iter, + [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, + [=](Vec256 self_vec){ - // Comparision operators returns bitmask. - auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); - auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); + // Comparision operators returns bitmask. + auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); + auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); - return left - right; - }); + return left - right; + }); }); } } @@ -290,6 +290,15 @@ static void signbit_kernel(TensorIterator& iter){ }); } +static void sgn_kernel(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), 'sgn_cpu', [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, + [=](Vec256 a) { return a.sgn(); }); + }); +} + static void sinh_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( @@ -639,6 +648,7 @@ REGISTER_DISPATCH(reciprocal_stub, &reciprocal_kernel); REGISTER_DISPATCH(neg_stub, &neg_kernel); REGISTER_DISPATCH(sign_stub, &sign_kernel); REGISTER_DISPATCH(signbit_stub, &signbit_kernel); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel); REGISTER_DISPATCH(sinh_stub, &sinh_kernel); REGISTER_DISPATCH(cosh_stub, &cosh_kernel); REGISTER_DISPATCH(acosh_stub, &acosh_kernel); diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index d6816f4dd182..e0554e0cbc29 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -138,6 +138,15 @@ inline c10::complex ceil_impl (c10::complex z) { return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); } +template +inline c10::complex sgn_impl (c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / zabs(z); + } +} + template inline TYPE floor_impl (TYPE z) { return std::floor(z); diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index 3d90089556be..cd02c89f23f0 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -51,9 +51,26 @@ void signbit_kernel_cuda(TensorIterator& iter){ }); } +template +__host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / std::abs(z); + } +} + +void sgn_kernel_cuda(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return sgn_wrapper(a); + }); + }); +} REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(sign_stub, 
&sign_kernel_cuda); REGISTER_DISPATCH(signbit_stub, &signbit_kernel_cuda); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a84c9b4b61b8..3244522f1808 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -277,6 +277,15 @@ use_c10_dispatcher: full variants: function +- func: sgn(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function diff --git a/benchmarks/operator_benchmark/pt/unary_test.py b/benchmarks/operator_benchmark/pt/unary_test.py index 4a8a7865330b..1391283b1e10 100644 --- a/benchmarks/operator_benchmark/pt/unary_test.py +++ b/benchmarks/operator_benchmark/pt/unary_test.py @@ -91,6 +91,7 @@ def forward(self): ['sigmoid', torch.sigmoid], ['sigmoid_', torch.sigmoid_], ['sign', torch.sign], + ['sgn', torch.sgn], ['sin', torch.sin], ['sin_', torch.sin_], ['sinh', torch.sinh], diff --git a/docs/source/name_inference.rst b/docs/source/name_inference.rst index 7fc84e092633..ccbb8c0c54d3 100644 --- a/docs/source/name_inference.rst +++ b/docs/source/name_inference.rst @@ -197,6 +197,8 @@ If you don't see an operation listed here, but it would help your use case, plea :meth:`Tensor.sigmoid_`,None ":meth:`Tensor.sign`, :func:`torch.sign`",:ref:`keeps_input_names-doc` :meth:`Tensor.sign_`,None + ":meth:`Tensor.sgn`, :func:`torch.sgn`",:ref:`keeps_input_names-doc` + :meth:`Tensor.sgn_`,None ":meth:`Tensor.sin`, :func:`torch.sin`",:ref:`keeps_input_names-doc` :meth:`Tensor.sin_`,None ":meth:`Tensor.sinh`, :func:`torch.sinh`",:ref:`keeps_input_names-doc` diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index d7a94711e76b..cd1c363604fe 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -532,6 +532,8 @@ view of a storage and defines numeric operations on it. .. automethod:: sign .. automethod:: sign_ .. automethod:: signbit + .. automethod:: sgn + .. automethod:: sgn_ .. automethod:: sin .. automethod:: sin_ .. 
automethod:: sinh diff --git a/test/test_autograd.py b/test/test_autograd.py index 9d037fd7c138..938a41c2c089 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -4692,7 +4692,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'round', 'eq_', 'ne_', 'add', '__radd__', 'sum', 'conj', 'sin', 'cos', 'mul', 'sinh', - 'cosh', '__rmul__'] + separate_complex_tests + 'cosh', '__rmul__', 'sgn'] + separate_complex_tests # TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition - @anjali411 # complex_list += ['fill_', 't', '__rdiv__', 'tanh'] diff --git a/test/test_torch.py b/test/test_torch.py index a2f5f21dab1e..c8dfd5115333 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -11288,6 +11288,19 @@ def test_signbit_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'signbit is not implemented for complex tensors.'): torch.signbit(t, out=out) + @dtypes(torch.cfloat, torch.cdouble) + def test_sgn(self, device, dtype): + x = torch.randn(100, dtype=dtype) + angle = x.angle() + out = x.sgn() + self.assertEqual(out.angle(), angle) + self.assertEqual(out.abs(), torch.ones_like(x).real) + + x_out = torch.empty_like(x) + torch.sgn(x, out=x_out) + self.assertEqual(x_out.angle(), angle) + self.assertEqual(x_out.abs(), torch.ones_like(x).real) + @dtypes(*(torch.testing.get_all_dtypes(include_bool=False))) def test_signbit_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters @@ -14709,6 +14722,8 @@ def _test_helper(x, y, bias, memory_format): lambda x, y: x.logit_(1e-6), lambda x, y: x.sign(), lambda x, y: x.sign_(), + lambda x, y: x.sgn(), + lambda x, y: x.sgn_(), lambda x, y: x.sin(), lambda x, y: x.sin_(), lambda x, y: x.sinh(), diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9ee296e83035..70ddaee5226f 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -159,7 +159,7 @@ # NB: The parameter names here MUST be consistent with the parameter names # in Decalarations.yaml - name: abs(Tensor self) -> Tensor - self: grad * self.sign() + self: grad * self.sgn() - name: acos(Tensor self) -> Tensor self: grad * -((-self * self + 1).rsqrt()) @@ -397,11 +397,11 @@ # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414 # Note that we don't use "result" because saving it would be BC-breaking when it is used in an inplace operation later - name: div.Tensor(Tensor self, Tensor other) -> Tensor - self: grad / other - other: -grad * (self / other) / other + self: div_tensor_self_backward(grad, other, self.scalar_type()) + other: div_tensor_other_backward(grad, self, other) - name: div.Scalar(Tensor self, Scalar other) -> Tensor - self: grad / other + self: div_tensor_self_backward(grad, at::scalar_to_tensor(other), self.scalar_type()) - name: dot(Tensor self, Tensor tensor) -> Tensor self: grad * tensor @@ -928,6 +928,9 @@ - name: sign(Tensor self) -> Tensor self: zeros_like(grad) +- name: sgn(Tensor self) -> Tensor + self: sgn_backward(result, grad, self) + - name: sin(Tensor self) -> Tensor self: grad * self.cos().conj() diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 2a83aeca0de8..55c5613cdcc3 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3121,6 +3121,20 @@ def callable(a, b) -> 
number See :func:`torch.signbit` """) +add_docstr_all('sgn', + r""" +sgn() -> Tensor + +See :func:`torch.sgn` +""") + +add_docstr_all('sgn_', + r""" +sgn_() -> Tensor + +In-place version of :meth:`~Tensor.sgn` +""") + add_docstr_all('sin', r""" sin() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d0f6f8c92151..5a3b2339fde5 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -6603,6 +6603,31 @@ def merge_dicts(*dicts): tensor([ False, True, False, False]) """.format(**common_args)) +add_docstr(torch.sgn, + r""" +sgn(input, *, out=None) -> Tensor + +For complex tensors, this function returns a new tensor whose elemants have the same angle as that of the +elements of :attr:`input` and absolute value 1. For a non-complex tensor, this function +returns the signs of the elements of :attr:`input` (see :func:`torch.sign`). + +:math:`\text{out}_{i} = 0`, if :math:`|{\text{{input}}_i}| == 0` +:math:`\text{out}_{i} = \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|}`, otherwise + +""" + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> x=torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> x.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) +""".format(**common_args)) + add_docstr(torch.sin, r""" sin(input, out=None) -> Tensor diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 29f0720fb3c7..1e73ebac2a2a 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -211,6 +211,17 @@ Tensor mvlgamma_backward(Tensor grad, const Tensor & self, int64_t p) { return grad * args.digamma_().sum(-1); } +Tensor sgn_backward(Tensor result, Tensor grad, Tensor self) { + if (self.is_complex()) { + auto abs = at::abs(self); + // C -> C + // https://arxiv.org/pdf/1701.00392.pdf Section 4.20 + return at::where(abs == 0.0, at::zeros({}, grad.options()), (grad/abs - (at::real(grad/self) * result))); + } else { + return at::zeros_like(grad, at::MemoryFormat::Preserve); + } +} + Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { auto result = grad * other.conj(); if (!at::isComplexType(self_st) && result.is_complex()) { @@ -220,6 +231,24 @@ Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { return result; } +Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) { + auto result = grad / other.conj(); + if (!at::isComplexType(self_st) && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + +Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) { + auto result = -grad * ((self / other) / other).conj(); + if (!other.is_complex() && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) { // invert the permutation auto ndims = fwd_dims.size(); diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index b4e7d1667f88..8fd0e9b08cc4 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -44,6 +44,8 @@ at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at: at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Scalar & base, const at::Tensor& exponent, at::Tensor result); at::Tensor mul_tensor_backward(Tensor 
grad, Tensor other, ScalarType self_st); +at::Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st); +at::Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other); at::Tensor mvlgamma_backward(at::Tensor grad, const at::Tensor & self, int64_t p); at::Tensor permute_backwards(const at::Tensor & grad, at::IntArrayRef fwd_dims); at::Tensor rad2deg_backward(const at::Tensor& grad); @@ -74,6 +76,7 @@ at::Tensor sum_tensorlist(at::TensorList tl); at::Tensor repeat_backward(at::Tensor grad, int64_t input_dims, at::IntArrayRef repeats); at::Tensor _fused_dropout_backward(at::Tensor grad, at::Tensor mask, double p1m); at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value); +at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self); at::Tensor var_backward(const at::Tensor & grad, const at::Tensor & self, bool unbiased); at::Tensor var_backward(at::Tensor grad, const at::Tensor & self, at::IntArrayRef dim, bool unbiased, bool keepdim); at::Tensor std_backward(const at::Tensor & result, const at::Tensor & grad, const at::Tensor & self, bool unbiased); diff --git a/torch/overrides.py b/torch/overrides.py index 60f615bb1b0e..d17c6c4f7473 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -701,6 +701,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.sigmoid: lambda input, out=None: -1, torch.sign: lambda input, out=None: -1, torch.signbit: lambda input, out=None: -1, + torch.sgn: lambda input, out=None: -1, torch.sin: lambda input, out=None: -1, torch.sinh: lambda input, out=None: -1, torch.slogdet: lambda input: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 643261461fc8..dd429deacbf0 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -696,6 +696,8 @@ def method_tests(): ('round', (), NO_ARGS, 'scalar', (True,)), ('sign', (S, S, S), NO_ARGS), ('sign', (), NO_ARGS, 'scalar'), + ('sgn', (S, S, S), NO_ARGS), + ('sgn', (), NO_ARGS, 'scalar'), ('trunc', (S, S, S), NO_ARGS, '', (True,)), ('trunc', (), NO_ARGS, 'scalar', (True,)), ('floor', (S, S, S), NO_ARGS, '', (True,)), From 36ec8f8fb8d0356db7cb67230500f70833a2b2ba Mon Sep 17 00:00:00 2001 From: Brandon Lin Date: Tue, 22 Sep 2020 08:22:58 -0700 Subject: [PATCH 007/449] [dper3] Create dper LearningRate low-level module (#44639) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44639 As title; this will unblock migration of several modules that need learning rate functionality. Test Plan: ``` buck test //dper3/dper3/modules/low_level_modules/tests:learning_rate_test ``` Reviewed By: yf225 Differential Revision: D23681733 fbshipit-source-id: 1d98cb35bf6a4ff0718c9cb6abf22401980b523c --- caffe2/sgd/learning_rate_op.cc | 5 ++-- caffe2/sgd/learning_rate_op.h | 10 ++++---- .../check_backward_compatibility.py | 24 ++++++++++--------- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc index 534f89d68360..e8172ab65efe 100644 --- a/caffe2/sgd/learning_rate_op.cc +++ b/caffe2/sgd/learning_rate_op.cc @@ -164,7 +164,7 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "int? max_iter = -1, " "int? num_iter = 0, " "float? start_multiplier = 0, " - "float? end_mulitplier = 0, " + "float? end_multiplier = 0, " "float? multiplier = 0.5, " "float? multiplier_1 = 1.0, " "float? 
multiplier_2 = 1.0, " @@ -184,5 +184,6 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "float? cosine_max_lr = 0.05, " "int? cosine_period = 50, " "float? cosine_t_mult = 1.0, " - "float? cosine_lr_shrink = 0.99) -> Tensor output", + "float? cosine_lr_shrink = 0.99, " + "float? decay = 1.0) -> Tensor output", LearningRateOpFloatCPU); diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h index fa35ad4c8d6f..3ba6bef39e63 100644 --- a/caffe2/sgd/learning_rate_op.h +++ b/caffe2/sgd/learning_rate_op.h @@ -81,13 +81,13 @@ class LearningRateOp final : public Operator { return new HillLearningRate( num_iter, start_multiplier, gamma, power, end_multiplier); } else if (policy == "slope") { - int64_t num_iter_1 = - this->template GetSingleArgument(arg_prefix + "num_iter_1", 0); + int64_t num_iter_1 = this->template GetSingleArgument( + arg_prefix + "num_iter_1", 0); DCHECK_GT(num_iter_1, 0); T multiplier_1 = this->template GetSingleArgument( arg_prefix + "multiplier_1", 0.); - int64_t num_iter_2 = - this->template GetSingleArgument(arg_prefix + "num_iter_2", 0); + int64_t num_iter_2 = this->template GetSingleArgument( + arg_prefix + "num_iter_2", 0); DCHECK_GT(num_iter_1, 0); T multiplier_2 = this->template GetSingleArgument( arg_prefix + "multiplier_2", 0.); @@ -191,7 +191,7 @@ class LearningRateOp final : public Operator { int stepsize = this->template GetSingleArgument(arg_prefix + "stepsize", 0); T decay = - this->template GetSingleArgument(arg_prefix + "decay", 1.0); + this->template GetSingleArgument(arg_prefix + "decay", 1.0); DCHECK_GT(stepsize, 0); DCHECK_GE(max_lr, base_lr_); return new CyclicalLearningRate(base_lr_, max_lr, stepsize, decay); diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index d5cbe5a884a9..739a4de51951 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -1,4 +1,3 @@ - import argparse import datetime import re @@ -58,16 +57,16 @@ ("aten::atan2", datetime.date(2020, 7, 30)), ("aten::copy_", datetime.date(2020, 7, 30)), ("aten::sort", datetime.date(2020, 7, 30)), - ('aten::_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose', datetime.date(2020, 10, 15)), - ('aten::_convolution_double_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_weight', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_weight', datetime.date(2020, 10, 15)), + ("aten::_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose", datetime.date(2020, 10, 15)), + ("aten::_convolution_double_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_input", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_weight", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_input", 
datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_weight", datetime.date(2020, 10, 15)), ("aten::_cudnn_init_dropout_state", datetime.date(2020, 7, 30)), ("aten::sparse_coo_tensor", datetime.date(2020, 7, 30)), ("aten::_sparse_coo_tensor_with_dims", datetime.date(2020, 7, 30)), @@ -90,6 +89,7 @@ ("aten::logspace", datetime.date(2020, 9, 30)), ("aten::logspace.out", datetime.date(2020, 9, 30)), ("__getstate__", datetime.date(2020, 9, 11), "Conv[23]dPackedParams"), + ("_caffe2::LearningRate", datetime.date(2020, 10, 1)), ("aten::_var", datetime.date(2020, 10, 1)), ("aten::_std", datetime.date(2020, 10, 1)), ("aten::_foreach_add_", datetime.date(2020, 10, 1)), @@ -115,6 +115,7 @@ def allow_listed(schema, allow_list): return True return False + # The nightly will fail to parse newly added syntax to schema declarations # Add new schemas that will fail the nightly here dont_parse_list = [ @@ -122,6 +123,7 @@ def allow_listed(schema, allow_list): ("test_backend", datetime.date(2099, 9, 17)), ] + def dont_parse(schema_line): for item in dont_parse_list: if item[1] < datetime.date.today(): From 4a0aa69a66cd7dac8dfba2268163c5b5dec899f4 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 22 Sep 2020 08:50:55 -0700 Subject: [PATCH 008/449] Fix undefined variable 'namedshape' in tensor.py (#45085) Summary: Hot Fix Pull Request resolved: https://github.com/pytorch/pytorch/pull/45085 Reviewed By: malfet, seemethere Differential Revision: D23824444 Pulled By: walterddr fbshipit-source-id: c9f37b394d281b7ef44b14c30699bb7510a362a7 --- torch/tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/tensor.py b/torch/tensor.py index be79dd5c3cd8..18dccfda7c8b 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -845,7 +845,7 @@ def unflatten(self, dim, sizes): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.unflatten, relevant_args, self, dim, namedshape) + return handle_torch_function(Tensor.unflatten, relevant_args, self, dim, sizes) if not sizes: raise RuntimeError("unflatten: sizes must be non-empty") From e155fbe915ff4553d0c0f81df728d606498fee15 Mon Sep 17 00:00:00 2001 From: albanD Date: Tue, 22 Sep 2020 08:51:58 -0700 Subject: [PATCH 009/449] add warning when ParameterList/Dict is used with DataParallel (#44405) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44405 Test Plan: Imported from OSS Reviewed By: agolynski Differential Revision: D23783987 Pulled By: albanD fbshipit-source-id: 5018b0d381cb09301d2f88a98a910854f740ace1 --- test/distributed/test_data_parallel.py | 30 ++++++++++++++++++++++++++ test/test_nn.py | 13 +++++++++++ torch/nn/modules/container.py | 24 +++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index dee5fd702b16..99a10906462a 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -775,6 +775,36 @@ def forward(self, x): print("Caught exception during iterations at " + named_msg, flush=True) raise + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + def test_parameter_list_dict_replica(self): + class MyMod(torch.nn.Module): + def __init__(self, data): + super(MyMod, self).__init__() + self.data = data + + def forward(self, inp): + return inp + + p1 = torch.nn.Parameter(torch.rand(10)) + p2 = 
torch.nn.Parameter(torch.rand(10)) + module = MyMod(torch.nn.ParameterList([p1, p2])).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterList is being used with DataParallel but this"): + model(input) + + module = MyMod(torch.nn.ParameterDict({"0": p1, "1": p2})).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterDict is being used with DataParallel but this"): + model(input) + if __name__ == '__main__': run_tests() diff --git a/test/test_nn.py b/test/test_nn.py index 7c8f6b7b2874..2dde3c46b74e 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -2646,6 +2646,19 @@ def test_weight_norm(self): m = torch.nn.utils.weight_norm(m) m = torch.nn.utils.weight_norm(m) + def test_parameterlistdict_setting_attributes(self): + mod = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterList is not supported"): + torch.nn.utils.weight_norm(mod, "0") + + mod = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterDict is not supported"): + torch.nn.utils.weight_norm(mod, "b") + def test_weight_norm_pickle(self): m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) m = pickle.loads(pickle.dumps(m)) diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index c9db80d64fdb..f5d07ae4a69c 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -429,6 +429,11 @@ def __setitem__(self, idx: int, param: 'Parameter') -> None: idx = self._get_abs_string_index(idx) return self.register_parameter(str(idx), param) + def __setattr__(self, key: Any, value: Any) -> None: + if not isinstance(value, torch.nn.Parameter): + warnings.warn("Setting attributes on ParameterList is not supported.") + super(ParameterList, self).__setattr__(key, value) + def __len__(self) -> int: return len(self._parameters) @@ -480,6 +485,13 @@ def extra_repr(self) -> str: def __call__(self, input): raise RuntimeError('ParameterList should not be called.') + def _replicate_for_data_parallel(self): + warnings.warn("nn.ParameterList is being used with DataParallel but this is not " + "supported. This list will appear empty for the models replicated " + "on each GPU except the original one.") + + return super(ParameterList, self)._replicate_for_data_parallel() + class ParameterDict(Module): r"""Holds parameters in a dictionary. @@ -533,6 +545,11 @@ def __setitem__(self, key: str, parameter: 'Parameter') -> None: def __delitem__(self, key: str) -> None: del self._parameters[key] + def __setattr__(self, key: Any, value: Any) -> None: + if not isinstance(value, torch.nn.Parameter): + warnings.warn("Setting attributes on ParameterDict is not supported.") + super(ParameterDict, self).__setattr__(key, value) + def __len__(self) -> int: return len(self._parameters) @@ -621,3 +638,10 @@ def extra_repr(self) -> str: def __call__(self, input): raise RuntimeError('ParameterDict should not be called.') + + def _replicate_for_data_parallel(self): + warnings.warn("nn.ParameterDict is being used with DataParallel but this is not " + "supported. 
This dict will appear empty for the models replicated " + "on each GPU except the original one.") + + return super(ParameterDict, self)._replicate_for_data_parallel() From 63fd257879db488e693b3b84ac4311a152df7497 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Sep 2020 08:58:17 -0700 Subject: [PATCH 010/449] Add `Ellipsis` constant to the list of recognized tokens (#44959) Summary: Per https://docs.python.org/3.6/library/constants.html > `Ellipsis` is the same as ellipsis literal `...` Pull Request resolved: https://github.com/pytorch/pytorch/pull/44959 Reviewed By: suo Differential Revision: D23785660 Pulled By: malfet fbshipit-source-id: f68461849e7d16ef68042eb96566f2c936c06b0f --- torch/csrc/jit/frontend/lexer.h | 3 ++- torch/csrc/jit/frontend/parser.cpp | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/frontend/lexer.h b/torch/csrc/jit/frontend/lexer.h index f78dd7a7d11b..3a83d8b9a87f 100644 --- a/torch/csrc/jit/frontend/lexer.h +++ b/torch/csrc/jit/frontend/lexer.h @@ -111,7 +111,8 @@ namespace jit { _(TK_WITH, "with", "with") \ _(TK_WITH_ITEM, "withitem", "") \ _(TK_AS, "as", "as") \ - _(TK_PROP, "property", "") + _(TK_PROP, "property", "") \ + _(TK_ELLIPSIS, "Ellipsis", "Ellipsis") enum TokenKind { // we use characters to represent themselves so skip all valid characters diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index 66c75d8a499d..c9f4aac038cc 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -167,6 +167,10 @@ struct ParserImpl { prefix = Dots::create(L.cur().range); L.next(); } break; + case TK_ELLIPSIS: { + prefix = Dots::create(L.cur().range); + L.next(); + } break; default: { Ident name = parseIdent(); prefix = Var::create(name.range(), name); From 9fc7a942f0043a79ccd0ef0c3d55a844249b52d3 Mon Sep 17 00:00:00 2001 From: Himangshu Date: Tue, 22 Sep 2020 09:05:41 -0700 Subject: [PATCH 011/449] Change from self to self.class() in _DecoratorManager to ensure a new object is every time a function is called recursively (#44633) Summary: Change from self to self._class_() in _DecoratorManager to ensure a new object is every time a function is called recursively Fixes https://github.com/pytorch/pytorch/issues/44531 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44633 Reviewed By: agolynski Differential Revision: D23783601 Pulled By: albanD fbshipit-source-id: a818664dee7bdb061a40ede27ef99e9546fc80bb --- test/test_autograd.py | 47 +++++++++++++++++++++++++++++++++++++ torch/autograd/grad_mode.py | 4 ++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 938a41c2c089..c03c1a496605 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1001,6 +1001,53 @@ def gen_enable_grad(): for _ in gen_enable_grad(): self.assertEqual(torch.is_grad_enabled(), False) + def test_set_grad_generator_functions_recursive(self): + # enable_grad_decorator_recursive and no_grad_decorator_recursive call each other + # recursively, to ensure that the decorators preserve the caller's setting + @torch.enable_grad() + def enable_grad_decorator_recursive(depth): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_decorator_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + @torch.no_grad() + def no_grad_decorator_recursive(depth): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_decorator_recursive(depth - 1) + 
self.assertFalse(torch.is_grad_enabled()) + + # enable_grad_context_manager_recursive and no_grad_context_manager_recursive call + # each other recursively, to ensure that the decorators preserve the caller's setting + def enable_grad_context_manager_recursive(depth): + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_context_manager_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + def no_grad_context_manager_recursive(depth): + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_context_manager_recursive(depth - 1) + self.assertFalse(torch.is_grad_enabled()) + + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 4bcc3be1d85b..bbd96e941a54 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -22,7 +22,7 @@ def __call__(self, func: F) -> F: @functools.wraps(func) def decorate_context(*args, **kwargs): - with self: + with self.__class__(): return func(*args, **kwargs) return cast(F, decorate_context) @@ -33,7 +33,7 @@ def generator_context(*args, **kwargs): gen = func(*args, **kwargs) while True: try: - with self: + with self.__class__(): x = next(gen) yield x except StopIteration: From ae286d81e00f45b81778635c1aa482d64f2ec7bc Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Tue, 22 Sep 2020 09:37:00 -0700 Subject: [PATCH 012/449] [JIT] improve alias analysis for list constructs (#39111) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/39111 In our present alias analysis, we consider any Value that enter another container as entering the heap, and thus aliasing all other heap values of the same type. There are a number of advantages to this approach: - it is not to hard to maintain the aliasDb implementation - it is much easier from an op schema perspective - there are many composite list ops registered internally and externally that would be tricky to register and get right if we did something more complicated - It limits the size of the AliasDb, because a container of size 10 only contains a single memory dag element instead of 10 elements. The downside is that we have are unable to handle the simple and extremely common case of a list of tensors being used in an ATen op. In an example like: ``` def foo(input): x = torch.tensor([1, 2, 3, 4]) y = [x, x] input.add_(1) return torch.cat(y) ``` we will consider x to be written to. any write to any wildcard element (an element that enters a tuple, an element that is taken from a list) will mark x as written to. This can be limiting for our ability to create a functional subset and fuse graphs - as a result, 4 of TorchVision classification models could not be functionalized. 
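To see why the relaxed handling is sound for this pattern: the ops in question copy their inputs, so their output never aliases the list or the tensors it contains. A short eager-mode illustration of the `cat` case from the example above (a sketch added for clarity, not part of the patch):

```
import torch

x = torch.tensor([1, 2, 3, 4])
y = [x, x]
out = torch.cat(y)   # copies the data; `out` does not alias `x`
x.add_(1)            # a later in-place update to x ...
print(out)           # ... is not observable: tensor([1, 2, 3, 4, 1, 2, 3, 4])
```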
Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D23828003 Pulled By: eellison fbshipit-source-id: 9109fcb6f2ca20ca897cae71683530285da9d537 --- test/cpp/jit/test_alias_analysis.cpp | 26 ++++++++++ test/jit/test_remove_mutation.py | 41 ++++++++++++++++ torch/csrc/jit/ir/alias_analysis.cpp | 72 ++++++++++++++++++++++++---- torch/csrc/jit/ir/alias_analysis.h | 5 +- 4 files changed, 133 insertions(+), 11 deletions(-) diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp index e854113a7a87..e700ee540616 100644 --- a/test/cpp/jit/test_alias_analysis.cpp +++ b/test/cpp/jit/test_alias_analysis.cpp @@ -1238,6 +1238,32 @@ TEST(AliasRegistrationTest, PureWithAnnotationsShouldError) { "Tried to register operator foo::rand11(Tensor(a) arg1) -> (Tensor(a)) with aliasing information in the schema but without AliasAnalysisKind::FROM_SCHEMA"); } +TEST(AliasRegistrationTest, AliasMoveAtenListOp) { + auto graph = std::make_shared(); + std::unordered_map vmap; + auto graph_string = R"IR( + graph(): + %x : Tensor = prim::MakeTestTensor() + %8 : int = prim::Constant[value=0]() + %5 : int = prim::Constant[value=1]() + %4 : int = prim::Constant[value=2]() + %y : Tensor[] = prim::ListConstruct(%x) + %6 : Tensor = aten::add_(%x, %4, %5) + %9 : Tensor = aten::cat(%y, %8) + return (%9))IR"; + + torch::jit::parseIR(graph_string, graph.get(), vmap); + AliasDb aliasDb(graph); + + // bc y.1 has a single used in a single non-aliasing aten op, + // x is added to y.1 contained elements instead of wildcard set + EXPECT_TRUE(!aliasDb.mayAlias(vmap["x"], vmap["9"])); + + // write to contained element should prevent move + EXPECT_TRUE(!aliasDb.moveBeforeTopologicallyValid( + vmap["y"]->node(), vmap["9"]->node())); +} + TEST(AliasRegistrationTest, PureWithAnnotationsShouldError2) { auto registry = torch::RegisterOperators().op( "foo::rand12(Tensor(a) arg1) -> Tensor(b)", diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py index ef408e775c33..b747fc06bcde 100644 --- a/test/jit/test_remove_mutation.py +++ b/test/jit/test_remove_mutation.py @@ -200,3 +200,44 @@ def intermediary_use(): # it is possible to remove the append here but don't currently have the logic for it FileCheck().check_not("append").run(graph) self.assertEqual(intermediary_use(), fn()) + + def test_common_pytorch_list_ops(self): + for op in ["cat", "stack", "vstack", "hstack", "dstack"]: + class OpMod(torch.nn.Module): + def __init__(self, op): + super(OpMod, self).__init__() + self.op = torch_op + + def forward(self): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return self.op(y) + 3 + + torch_op = getattr(torch, op) + mod = OpMod(torch_op) + mod_script = torch.jit.script(mod) + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check_not("aten::add_").run(mod_script.forward.graph) + self.assertEqual(mod(), mod_script()) + + # test that the output doesnt alias the input + for inputs in [torch.rand(2, 2)], [torch.rand(2, 2) for _ in range(2)]: + result = torch_op(inputs) + sums = [ten.sum() for ten in result] + + for inp in inputs: + inp.fill_(10) + + self.assertEqual(sums, [ten.sum() for ten in result]) + + + @torch.jit.script + def test_multiple_uses(): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return torch.cat(y), y + + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check("aten::add_").run(test_multiple_uses.graph) diff --git a/torch/csrc/jit/ir/alias_analysis.cpp 
b/torch/csrc/jit/ir/alias_analysis.cpp index 50b84d8f6405..bb5872f35f4f 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -298,15 +299,10 @@ void AliasDb::getReadsImpl(Node* n, MemoryLocations& ret) const { auto it = elementMap_.find(input); if (it != elementMap_.end()) { auto el = it->second; - // Add all memory locations this element may alias. - ret |= memoryDAG_->getMemoryLocations(el); - // We also consider memory locations of contained values to be "read". - for (const auto& type : input->type()->containedTypes()) { - if (auto wildcard = getWildcard(type)) { - ret |= memoryDAG_->getMemoryLocations(wildcard); - } - } + // Add all memory locations this element may alias and their contained + // elements + memoryDAG_->collectAllContainedMemoryLocations(el, ret); } } @@ -878,6 +874,44 @@ void AliasDb::analyzeConservative(Node* node) { } } +bool AliasDb::functionalNonEscapingListUse(const Use& use) const { + Node* n = use.user; + size_t offset = use.offset; + Value* container = n->inputs().at(offset); + + // only consider aten op uses of lists + if (!container->type()->cast()) { + return false; + } + + /* + in the general case, we consider any Value that enters another container as + entering the heap, and thus aliasing all other heap values of the same type. + the advantage of this approach are: + - there are many composite list/container ops that would be tricky to + schematize if we did something more complicated + - limits the size of the AliasDb, because a container of size 10 only contains + 1 memory dag element instead of 10 + - we do not need to worry about adding contained elements to the wildcard set + when a container escapes the graph. + The downside of this approach is we are unable to handle the common case of a + list constructed and passed into an aten op. Here, optimize for a set of + common ops where the output does not alias the list or the list elements + */ + + switch (use.user->kind()) { + case aten::cat: + case aten::broadcast_tensors: + case aten::stack: + case aten::vstack: + case aten::hstack: + case aten::dstack: + return true; + } + + return false; +} + // List or dict or tuple: construct: create an aliasing element for the actual // container, then mark all inputs as wildcards, since they've gone inside the // container. Then, add the wildcard sets of appropriate type to the contained @@ -895,6 +929,20 @@ void AliasDb::analyzeContainerConstruct(Node* node) { TORCH_INTERNAL_ASSERT(node->outputs().size() == 1); auto container = node->output(); + + // optimization: + // if a list is only used once in an aten op, and the op output + // doesn't alias the input, then we can add all inputs to the list's + // contained elements instead of the wildcard set. 
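On the guard implemented just below: the relaxed handling only applies when the list has exactly one use and that use is one of a small set of ops (`cat`, `broadcast_tensors`, `stack`, `vstack`, `hstack`, `dstack`) whose output does not alias the input list. Any additional use of the list keeps the conservative behavior, mirroring the `test_multiple_uses` case added above. A hedged sketch of the two situations:

```
import torch

@torch.jit.script
def single_use(x_in: torch.Tensor):
    x = x_in + 0
    y = [x, x]
    return torch.cat(y)      # only use of `y`: eligible for the relaxed analysis

@torch.jit.script
def multiple_uses(x_in: torch.Tensor):
    x = x_in + 0
    y = [x, x]
    return torch.cat(y), y   # `y` is also returned: analyzed conservatively
```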
+ if (container->uses().size() == 1 && + functionalNonEscapingListUse(container->uses().at(0))) { + giveFreshAlias(container, false); + for (Value* v : node->inputs()) { + addToContainedElements(v, container); + } + return; + } + giveFreshAlias(container); auto container_elem = elementMap_.at(container); for (auto input : node->inputs()) { @@ -1068,7 +1116,9 @@ void AliasDb::createValue(const Value* value) { elementMap_[value] = new_elem; } -void AliasDb::giveFreshAlias(const Value* value) { +void AliasDb::giveFreshAlias( + const Value* value, + bool add_wildcard_to_contained_elems) { auto maybe_mut_type = getMutableTypePtr(value->type()); if (!maybe_mut_type) { return; @@ -1082,7 +1132,9 @@ void AliasDb::giveFreshAlias(const Value* value) { auto new_elem = memoryDAGBuilder_->makeFreshValue(value); elementMap_[value] = new_elem; - addContainedTypesToFreshElement(new_elem, *maybe_mut_type); + if (add_wildcard_to_contained_elems) { + addContainedTypesToFreshElement(new_elem, *maybe_mut_type); + } } Element* AliasDb::getOrCreateElement(const Value* value) { diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h index e3e69185891f..b20654b1f6b9 100644 --- a/torch/csrc/jit/ir/alias_analysis.h +++ b/torch/csrc/jit/ir/alias_analysis.h @@ -205,10 +205,13 @@ class AliasDb { const Value* element, const Value* container); void mapAliases(at::ArrayRef to, at::ArrayRef from); - void giveFreshAlias(const Value* value); + void giveFreshAlias( + const Value* value, + bool add_wildcard_to_contained_elems = true); Element* getOrCreateElement(const Value* value); c10::optional getMutableTypePtr(const TypePtr& type) const; + bool functionalNonEscapingListUse(const Use& use) const; bool isContainerType(const TypePtr& type) const; From 4b42f0b6134977bf3ae1b3466ed4674ada9fe372 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Tue, 22 Sep 2020 09:57:24 -0700 Subject: [PATCH 013/449] Support Math keyword in native_functions.yaml. (#44556) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44556 Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D23698386 Pulled By: ailzhang fbshipit-source-id: f10ea839a2cfe7d16f5823a75b8b8c5f1ae22dde --- aten/src/ATen/native/README.md | 12 +++++++ aten/src/ATen/native/group_norm.cpp | 24 +++++++++++++ aten/src/ATen/native/native_functions.yaml | 1 + aten/src/ATen/templates/TypeDefault.cpp | 4 +++ aten/src/ATen/test/CMakeLists.txt | 1 + aten/src/ATen/test/math_kernel_test.cpp | 40 ++++++++++++++++++++++ tools/autograd/gen_variable_type.py | 7 ++-- tools/codegen/gen.py | 21 ++++++++++-- 8 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 aten/src/ATen/test/math_kernel_test.cpp diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 861901521a3b..f18114e73246 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -277,6 +277,18 @@ them the same thing!) If two backends have the same dispatch function, you can write `CPU, CUDA: func` to reuse the same function name in both cases. +Available backend options can be found at +https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py#L970. +In addition to backends above, we also support keyword `Math` which is an alias +that maps to all backend and autograd backend keys. In other words, function registered to `Math` key +should be a plain mathematical composition of other `at::` functions and works for any backend. 
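To make "plain mathematical composition" concrete: the `math_group_norm` kernel added later in this patch expresses group norm through batch norm plus reshapes. A rough Python rendering of that composition, for intuition only; the names and exact reshapes here are illustrative and not the C++ code.

```
import torch
import torch.nn.functional as F

def group_norm_by_composition(x, num_groups, weight=None, bias=None, eps=1e-5):
    N, C = x.shape[0], x.shape[1]
    orig_shape = x.shape
    # Normalize each (sample, group) slice by reusing batch_norm in training
    # mode with no running statistics, the same trick math_group_norm uses.
    out = x.reshape(1, N * num_groups, -1)
    out = F.batch_norm(out, None, None, training=True, momentum=0.0, eps=eps)
    out = out.reshape(orig_shape)
    affine_shape = [1, C] + [1] * (x.dim() - 2)
    if weight is not None:
        out = out * weight.reshape(affine_shape)
    if bias is not None:
        out = out + bias.reshape(affine_shape)
    return out

# e.g. for t = torch.randn(2, 6, 4, 4), the result should match
# F.group_norm(t, 3) up to numerical tolerance.
```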
+ +If you add `dispatch` section to any API that didn't have it before, you **have to** move +the old implementation to `Math` field so that it's still available for other backends to use. + +This work is currently WIP and you can find the design proposal in +https://github.com/pytorch/pytorch/issues/44680. + ### `device_guard` ``` diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 229e54a9ce62..beb4d940363e 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -133,5 +133,29 @@ Tensor group_norm( DEFINE_DISPATCH(GroupNormKernel); DEFINE_DISPATCH(GroupNormBackwardKernel); +std::tuple math_group_norm( + const at::Tensor& input, const at::Tensor& weight, + const at::Tensor& bias, int64_t N, int64_t C, int64_t HxW, + int64_t group, double eps) { + auto input_shape = input.sizes(); + at::Tensor input_reshaped = input.view({1, N * group, N ? -1 : 1}); + auto outputs = at::native_batch_norm( + input_reshaped, /*weight=*/{}, /*bias=*/{}, /*running_mean=*/{}, + /*running_var=*/{}, /*training=*/true, /*momentum=*/0, eps); + at::Tensor out = std::get<0>(outputs); + out = out.view(input_shape); + std::vector affine_param_shape(input.dim(), 1); + affine_param_shape[1] = C; + if (weight.defined() && bias.defined()) { + out = bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1); + } else if (weight.defined()) { + out = out.mul(weight.view(affine_param_shape)); + } else if (bias.defined()) { + out = out.add(bias.view(affine_param_shape)); + } + at::Tensor mean = std::get<1>(outputs).view({N, group}); + at::Tensor rstd = std::get<2>(outputs).view({N, group}); + return std::make_tuple(out, mean, rstd); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 3244522f1808..d5a746e2a522 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1731,6 +1731,7 @@ use_c10_dispatcher: full dispatch: CPU, CUDA: native_group_norm + Math: math_group_norm - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? 
weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index aa5bb4f0c838..6f2b988619c7 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -73,4 +73,8 @@ TORCH_LIBRARY(aten, m) { m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); } +TORCH_LIBRARY_IMPL(aten, Math, m) { + ${math_function_registrations}; +} + } // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index a0b992302084..43d0fc8ccd92 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,6 +27,7 @@ list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp new file mode 100644 index 000000000000..9a4dfd640c3e --- /dev/null +++ b/aten/src/ATen/test/math_kernel_test.cpp @@ -0,0 +1,40 @@ +#include + +#include + +using namespace at; + +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); + +// Ideally we want to test both forward and backward on math kernels but I +// haven't found an easy way to do it. Currently we only test forward here +// and rely on backward tests of each at:: function used in math kernels. +TEST(MathKernelTest, NativeGroupNorm) { + int num_channels = 6; + int N = 2; + int H = 2, W = 2; + int HxW = H * W; + + const auto input = randn({N, num_channels, H, W}); + const auto weight = randn({num_channels}); + const auto bias = randn({num_channels}); + double eps = 1e-05; + for (bool undef_weight: {true, false}) { + for (int num_groups: {3, 6, 1}) { + Tensor undef; + auto out = at::native::native_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + auto math_out = at::native::math_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + ASSERT_ALLCLOSE_TOLERANCES(std::get<0>(out), std::get<0>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<1>(out), std::get<1>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<2>(out), std::get<2>(math_out), 1e-4, 1e-6); + } + } +} + + diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 804da9193a50..e41c921f1e33 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -370,7 +370,8 @@ # Generate a file that lists all functions and their schema string. 
Used for XLA REGISTRATION_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"} +${return_type} ${api_name}(${declaration_formals}); \ +// {"schema": "${schema_string}", "compound": "${compound}", "has_math_kernel": "${has_math_kernel}"} """) # TraceType templates @@ -654,12 +655,12 @@ def gen_variable_type(out, aten_declarations, template_path): registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='false')) + compound='False')) else: registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='true')) + compound='True')) env = { 'registration_declarations': registration_declarations, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index e4acb369f08e..be8c57f1061a 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -290,7 +290,7 @@ def func(f: NativeFunction) -> Optional[str]: assert returns_type == dispatcher.returns_type(f.func.returns) dispatcher_args = dispatcher.arguments(f.func) dispatcher_args_types_str = ', '.join(map(lambda a: a.type, dispatcher_args)) - if dispatch is None: + if dispatch is None or dispatch == 'Math': type_name = f'TypeDefault::{name}' else: type_name = f'{dispatch}Type::{name}' @@ -811,6 +811,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('device_guard', f.device_guard), ('with_gil', False), ('deprecated', False), + ('has_math_kernel', f.dispatch is not None and 'Math' in f.dispatch), ]) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -1016,17 +1017,31 @@ def make_file_manager(install_dir: str) -> FileManager: del fm cpu_fm.write('TypeDefault.h', lambda: { - 'type_method_declarations': list(mapMaybe( + 'type_method_declarations': + list(mapMaybe( compute_type_method(None, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), native_functions)), + }) cpu_fm.write('TypeDefault.cpp', lambda: { - 'type_method_definitions': list(mapMaybe( + 'type_method_definitions': + list(mapMaybe( compute_type_method(None, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), native_functions)), + 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), native_functions)) if not options.per_op_registration else [], + + 'math_function_registrations': list(mapMaybe( + compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) if not options.per_op_registration else [], }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), From 8501b89a87398422025df32f50c0d3e1bbd152fb Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Tue, 22 Sep 2020 10:07:08 -0700 Subject: [PATCH 014/449] [ONNX] Update ort release (#45095) Summary: Update ort release Pull Request resolved: https://github.com/pytorch/pytorch/pull/45095 Reviewed By: bwasti Differential Revision: D23832041 Pulled By: malfet 
fbshipit-source-id: 39c47a87e451c4c43ba4d4e8be385cc195cc611a --- .jenkins/caffe2/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 20a7310a91c1..61fb7de08fe5 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # default pip version is too old(9.0.2), unable to support tag `manylinux2010`. # Fix the pip error: Couldn't find a version that satisfies the requirement pip install --upgrade pip - pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122 + pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.5.0.dev202009182 fi "$ROOT_DIR/scripts/onnx/test.sh" fi From 1fd48a9d1f4158fb015793caa282d2c0ca92059d Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Sep 2020 10:28:26 -0700 Subject: [PATCH 015/449] Revert D23798016: [FX] s/get_param/get_attr/ Test Plan: revert-hammer Differential Revision: D23798016 (https://github.com/pytorch/pytorch/commit/c941dd3492535b3e09f4cb3f60c80b02f5e04c3f) Original commit changeset: 1d2f3db1994a fbshipit-source-id: 974d930064b37d396c5d66c905a63d45449813e5 --- test/fx/quantization.py | 2 +- test/test_fx.py | 6 +++--- torch/fx/__init__.py | 4 ++-- torch/fx/graph.py | 10 +++++----- torch/fx/graph_module.py | 4 ++-- torch/fx/symbolic_trace.py | 8 ++++---- torch/quantization/fx/quantize.py | 4 ++-- torch/quantization/fx/utils.py | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/test/fx/quantization.py b/test/fx/quantization.py index 968c797c9163..8116ed5ce89a 100644 --- a/test/fx/quantization.py +++ b/test/fx/quantization.py @@ -222,7 +222,7 @@ def load_arg(a): for node in self.graph.nodes: if node.op == 'placeholder': result = next(args_iter) - elif node.op == 'get_attr': + elif node.op == 'get_param': result = self.state_dict[node.target] elif node.op == 'call_function': result = node.target(*load_arg(node.args), **load_arg(node.kwargs)) diff --git a/test/test_fx.py b/test/test_fx.py index 41607d64cbcc..89311e2a2873 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -336,7 +336,7 @@ def __init__(self, interpreter): placeholder_nodes.append(graph.create_node('placeholder', name)) # Get the interpreter object - interpreter_node = graph.create_node('get_attr', 'interpreter') + interpreter_node = graph.create_node('get_param', 'interpreter') # Add a node to call the interpreter instance output_node = graph.create_node( @@ -567,7 +567,7 @@ def test_graph_fns(self): g = Graph() a = g.placeholder('a') b = g.call_module('linear', (a,)) - c = g.get_attr('bias') + c = g.get_param('bias') d = g.call_method('add', (b, c)) e = g.call_function(torch.sin, (d,)) g.output(e) @@ -584,7 +584,7 @@ def test_construct_root_dict(self): graph : torch.fx.Graph = torch.fx.Graph() a : torch.fx.Node = graph.create_node('placeholder', 'x') b : torch.fx.Node = graph.create_node('call_module', 'foo.bar.baz', args=(a,)) - c : torch.fx.Node = graph.create_node('get_attr', 'zip.zap.zam') + c : torch.fx.Node = graph.create_node('get_param', 'zip.zap.zam') d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) graph.output(d) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index 185511460740..5b90c434340c 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -36,7 +36,7 @@ def forward(self, x): opcode name target args kwargs ------------- ------------- ------------------------------------------------------- 
------------------ ----------- placeholder x x () {} -get_attr linear_weight linear.weight () {} +get_param linear_weight linear.weight () {} call_function add_1 (x, linear_weight) {} call_module linear_1 linear (add_1,) {} call_method relu_2 relu [linear_1] {} @@ -48,7 +48,7 @@ def forward(self, x): - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. `target` is similarly the name of the argument. `args` and `kwargs` are don't-care -- `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the +- `get_param` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. `args` and `kwargs` are don't-care - `call_function` applies a free function to some values. `name` is similarly the name of the value to assign diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 1a8079ca8289..a63b7c8b35dc 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -80,7 +80,7 @@ def create_node(self, op: str, target: Target, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, name: Optional[str] = None) -> Node: - assert op in ('call_function', 'call_method', 'get_attr', 'call_module', 'placeholder') + assert op in ('call_function', 'call_method', 'get_param', 'call_module', 'placeholder') args = () if args is None else args kwargs = {} if kwargs is None else kwargs self._mark_uses(args) @@ -93,8 +93,8 @@ def create_node(self, op: str, target: Target, def placeholder(self, name: str) -> Node: return self.create_node('placeholder', name) - def get_attr(self, name: str) -> Node: - return self.create_node('get_attr', name) + def get_param(self, name: str) -> Node: + return self.create_node('get_param', name) def call_module(self, module_name: str, @@ -196,7 +196,7 @@ def python_code(self, root_module: str) -> Tuple[str, str, List[str]]: assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})\n') continue - elif node.op == 'get_attr': + elif node.op == 'get_param': assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}\n') continue @@ -230,7 +230,7 @@ def format_node(n : Node) -> Optional[str]: assert isinstance(n.target, str) placeholder_names.append(n.target) return None - elif n.op == 'get_attr': + elif n.op == 'get_param': return f'%{n.name} : [uses={n.uses}] = self.{n.target}' else: return f'%{n.name} : [uses={n.uses}] = {n.op}[target={n.target}](' \ diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 505ee991d6cc..df40cbd84fe1 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -124,13 +124,13 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): if hasattr(root, 'training'): self.training = root.training for node in graph.nodes: - if node.op in ['get_attr', 'call_module']: + if node.op in ['get_param', 'call_module']: assert isinstance(node.target, str) _copy_attr(root, self, node.target) elif isinstance(root, dict): targets_to_copy = [] for node in graph.nodes: - if node.op in ['get_attr', 'call_module']: + if node.op in ['get_param', 'call_module']: assert isinstance(node.target, str) if node.target not in root: raise RuntimeError('Node ' + str(node) + ' referenced target ' + node.target + diff --git 
a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 9b192dd5501f..442fa28c36d9 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -55,15 +55,15 @@ def create_arg(self, a: Any) -> Argument: if isinstance(a, torch.nn.Parameter): for n, p in self.root.named_parameters(): if a is p: - return self.create_node('get_attr', n, (), {}) + return self.create_node('get_param', n, (), {}) raise NameError('parameter is not a member of this module') # Tensors do not have a reliable string repr() from which they can be # constructed (and we probably don't want to rely on that, either), so # for any constant Tensor values we encounter, first search for if they # are an attribute of some module in the module hierarchy. If so, emit - # a get_attr to retrieve that tensor. Otherwise, we'll store away the + # a get_param to retrieve that tensor. Otherwise, we'll store away the # tensor value into a special attribute on the Module s.t. we can - # retrieve it with a get_attr. + # retrieve it with a get_param. if isinstance(a, torch.Tensor): # TODO: slow def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: @@ -96,7 +96,7 @@ def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: i += 1 setattr(self.root, qualname, a) - return self.create_node('get_attr', qualname, (), {}) + return self.create_node('get_param', qualname, (), {}) return super().create_arg(a) def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 7967b4ec2dcb..8d8ef0f328c3 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -177,7 +177,7 @@ def get_qconfig(module): self.qconfig_map = dict() for node in input_graph.nodes: - if node.op == 'get_attr': + if node.op == 'get_param': parent, _ = _parent_name(node.target) self.qconfig_map[node.name] = get_qconfig(self.modules[parent]) elif node.op == 'call_function': @@ -557,7 +557,7 @@ def load_arg(a): setattr(quantized_root, packed_weight_name, packed_weight) # replace prepack node with a getattr node env[node.name] = folded_graph.create_node( - 'get_attr', packed_weight_name, (), {}) + 'get_param', packed_weight_name, (), {}) elif prepack_node is not None: # remove the foled node continue diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 5d5532dc48fc..95d19df1e1b4 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -17,7 +17,7 @@ def graph_pretty_str(g, shorten=True) -> str: built_in_meth_re = re.compile('') op_dict = { 'placeholder': 'plchdr', - 'get_attr': 'gt_prm', + 'get_param': 'gt_prm', 'call_function': 'cl_fun', 'call_module': 'cl_mod', 'call_method': 'cl_meth', @@ -136,5 +136,5 @@ def get_next_qparams_idx(module, qparams): for key, value in qparams.items(): setattr(root_module, key + str(idx), value) qparam_full_path = key + str(idx) - inputs.append(graph.create_node('get_attr', qparam_full_path)) + inputs.append(graph.create_node('get_param', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) From 10f287539f64431a41c3571b40b966c0a8e85e65 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Tue, 22 Sep 2020 10:48:06 -0700 Subject: [PATCH 016/449] Align casing in test_dispatch with dispatch keys. 
(#44933) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44933 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23778247 Pulled By: ailzhang fbshipit-source-id: bc3725eae670b03543015afe763cb3bb16baf8f6 --- test/test_dispatch.py | 89 +++++++++++++++++++--------- torch/csrc/utils/python_dispatch.cpp | 13 ++-- 2 files changed, 68 insertions(+), 34 deletions(-) diff --git a/test/test_dispatch.py b/test/test_dispatch.py index ec9fd20797e3..45480d8916f0 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -229,11 +229,11 @@ def test_def(self): # m.impl("test_def", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("test_def", kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="cpu"), + lambda m: m.impl_t_t("foo", dispatch="CPU"), # m.impl("test_def", kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autograd"), + lambda m: m.impl_t_t("foo", dispatch="Autograd"), # m.impl("test_def", kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autogradcpu") + lambda m: m.impl_t_t("foo", dispatch="AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -262,11 +262,11 @@ def test_def_with_inference(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -296,11 +296,11 @@ def test_impl_only(self): # m.impl("foo", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -316,13 +316,13 @@ def test_computed_table(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kCUDA, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "xla", debug="fn_xla"), + lambda m: m.impl_t_t("foo", "XLA", debug="fn_xla"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu", debug="fn_autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU", debug="fn_autogradcpu") ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -351,12 +351,12 @@ def 
test_computed_table(self): ''') def test_computed_table_with_cpu_catchall(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -382,12 +382,12 @@ def test_computed_table_with_cpu_catchall(self): ''') def test_computed_table_with_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math"), + lambda m: m.impl_t_t("foo", "Math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -412,14 +412,14 @@ def test_computed_table_with_math(self): ''') def test_computed_table_with_cpu_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -445,12 +445,12 @@ def test_computed_table_with_cpu_math(self): ''') def test_computed_table_with_autograd(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -476,11 +476,11 @@ def test_computed_table_with_cpu_autograd_math_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -512,9 +512,9 @@ def test_computed_table_with_cpu_autograd_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", 
debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -538,6 +538,39 @@ def test_computed_table_with_cpu_autograd_catchall(self): AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] +''') + + def test_computed_table_with_ambiguous_autogradother(self): + result = self.commute("foo", [ + # m.def("foo", [](const Tensor & x) { return x }) + lambda m: m.def_name_t_t("foo"), + # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), + # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + ]) + state, table = result.state, result.table + self.assertExpectedInline(state, '''\ +name: test::foo +schema: test::foo(Tensor _0) -> (Tensor _0) +debug: registered at /dev/null:0 +alias analysis kind: CONSERVATIVE +QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +Math[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +catchall: default_def_name_t_t :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +''') + + # computed dispatch table is too big, so we only check on a few entries we're interested in. + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check) + + self.assertExpectedInline(extracted_table, '''\ +CPU: fn_math [math kernel] +CUDA: fn_math [math kernel] +XLA: fn_math [math kernel] +AutogradOther: ambiguous_autogradother [ambiguous autogradother] +AutogradCPU: fn_math [math kernel] +AutogradCUDA: fn_math [math kernel] +AutogradXLA: fn_math [math kernel] ''') # Can't do this yet for BC reasons @@ -631,7 +664,7 @@ def test_multiple_def_alias_mismatch(self): ) def test_multiple_fallback(self): - global_m = C._dispatch_library("IMPL", "_", "xla") + global_m = C._dispatch_library("IMPL", "_", "XLA") global_m.fallback_fallthrough(), try: global_m.fallback_fallthrough(), diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index 21bf8e69adc4..f0f63bf7a2f0 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -27,12 +27,13 @@ torch::Library::Kind parseKind(const std::string& k) { c10::optional parseDispatchKey(const std::string& k) { static std::unordered_map key_map = { - {"cpu", c10::DispatchKey::CPU}, - {"cuda", c10::DispatchKey::CUDA}, - {"xla", c10::DispatchKey::XLA}, - {"math", c10::DispatchKey::Math}, - {"autograd", c10::DispatchKey::Autograd}, - {"autogradcpu", c10::DispatchKey::AutogradCPU}, + {"CPU", c10::DispatchKey::CPU}, + {"CUDA", c10::DispatchKey::CUDA}, + {"XLA", c10::DispatchKey::XLA}, + {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, + {"Math", c10::DispatchKey::Math}, + {"Autograd", c10::DispatchKey::Autograd}, + {"AutogradCPU", c10::DispatchKey::AutogradCPU}, {"", c10::DispatchKey::Undefined}, }; auto it = key_map.find(k); From ef885c10d8591a924bd889d0d8778f485dc10f42 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Tue, 22 Sep 2020 11:33:45 -0700 Subject: [PATCH 017/449] [pytorch] Add triplet margin loss with custom distance (#43680) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43680 As 
discussed [here](https://github.com/pytorch/pytorch/issues/43342), adding in a Python-only implementation of the triplet-margin loss that takes a custom distance function. Still discussing whether this is necessary to add to PyTorch Core. Test Plan: python test/run_tests.py Imported from OSS Reviewed By: albanD Differential Revision: D23363898 fbshipit-source-id: 1cafc05abecdbe7812b41deaa1e50ea11239d0cb --- docs/source/nn.functional.rst | 7 +- docs/source/nn.rst | 3 +- test/test_nn.py | 80 ++++++++++++++++++++++ torch/nn/functional.py | 36 ++++++++++ torch/nn/functional.pyi.in | 11 ++- torch/nn/modules/__init__.py | 6 +- torch/nn/modules/loss.py | 123 +++++++++++++++++++++++++++++++++- torch/overrides.py | 3 + 8 files changed, 258 insertions(+), 11 deletions(-) diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst index eb88b50e6d56..416121cec8d6 100644 --- a/docs/source/nn.functional.rst +++ b/docs/source/nn.functional.rst @@ -483,6 +483,11 @@ Loss functions .. autofunction:: triplet_margin_loss +:hidden:`triplet_margin_with_distance_loss` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: triplet_margin_with_distance_loss + Vision functions ---------------- @@ -533,5 +538,3 @@ DataParallel functions (multi-GPU, distributed) ~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: torch.nn.parallel.data_parallel - - diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 3a6cb7e19316..8d195c04037c 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -10,7 +10,7 @@ These are the basic building block for graphs :depth: 2 :local: :backlinks: top - + .. currentmodule:: torch.nn @@ -269,6 +269,7 @@ Loss Functions nn.CosineEmbeddingLoss nn.MultiMarginLoss nn.TripletMarginLoss + nn.TripletMarginWithDistanceLoss Vision Layers ---------------- diff --git a/test/test_nn.py b/test/test_nn.py index 2dde3c46b74e..00614c0cdc34 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9866,6 +9866,7 @@ def v(fn): v(lambda: F.multilabel_margin_loss(input, zeros, reduction=reduction)) v(lambda: F.triplet_margin_loss(input, input, input, reduction=reduction)) + v(lambda: F.triplet_margin_with_distance_loss(input, input, input, reduction=reduction)) v(lambda: F.margin_ranking_loss(input, input, input.sign(), reduction=reduction)) v(lambda: F.cosine_embedding_loss(input, input, input[:, 0].sign(), reduction=reduction)) @@ -12185,6 +12186,85 @@ def test_threshold_inplace_overlap(self, device): F.threshold(x, 0.5, 0.5, inplace=True) F.threshold_(x, 0.5, 0.5) + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss_default_parity(self, device): + # Test for `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. Checks + # for parity against the respective non-distance-agnostic + # implementations of triplet margin loss (``nn.TripletMarginLoss` + # and `F.triplet_margin_loss`) under *default args*. 
+ + for extra_args in \ + itertools.product((0.5, 1, 1.5), (True, False), ('none', 'mean', 'sum')): + kwargs = {'margin': extra_args[0], 'swap': extra_args[1], 'reduction': extra_args[2]} + + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test forward, functional + expected = F.triplet_margin_loss(anchor, positive, negative, **kwargs) + actual = F.triplet_margin_with_distance_loss(anchor, positive, negative, **kwargs) + self.assertEqual(actual, expected, rtol=1e-6, atol=1e-6) + + # Test forward, module + loss_ref = nn.TripletMarginLoss(**kwargs) + loss_op = nn.TripletMarginWithDistanceLoss(**kwargs) + self.assertEqual(loss_op(anchor, positive, negative), + loss_ref(anchor, positive, negative), + rtol=1e-6, atol=1e-6) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, **kwargs), (anchor, positive, negative))) + self.assertTrue(gradcheck(lambda a, p, n: loss_op(a, p, n), + (anchor, positive, negative))) + + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss(self, device): + # Test for parity between `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. + + pairwise_distance = nn.PairwiseDistance() + + def cosine_distance(x, y): + return 1.0 - F.cosine_similarity(x, y) + + distance_functions = (pairwise_distance, cosine_distance, + lambda x, y: 1.0 - F.cosine_similarity(x, y)) + + reductions = ('mean', 'none', 'sum') + margins = (1.0, 1.5, 0.5) + swaps = (True, False) + + for distance_fn, reduction, margin, swap \ + in itertools.product(distance_functions, reductions, margins, swaps): + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, distance_function=distance_fn, reduction=reduction, margin=margin, swap=swap), + (anchor, positive, negative))) + loss_op = nn.TripletMarginWithDistanceLoss(distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + self.assertTrue(gradcheck(lambda a, p, n: loss_op( + a, p, n), (anchor, positive, negative))) + traced_loss_op = torch.jit.trace(loss_op, (anchor, positive, negative)) + self.assertTrue(gradcheck(lambda a, p, n: traced_loss_op( + a, p, n), (anchor, positive, negative))) + + # Test forward parity + functional = F.triplet_margin_with_distance_loss(anchor, positive, negative, + distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + modular = loss_op(anchor, positive, negative) + traced = traced_loss_op(anchor, positive, negative) + self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6) + self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6) + + class TestModuleGlobalHooks(TestCase): def tearDown(self): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index edde49a1d358..f4dbceeb88b1 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -3728,6 +3728,42 @@ def triplet_margin_loss(anchor, positive, negative, margin=1.0, p=2, eps=1e-6, s swap, reduction_enum) +def triplet_margin_with_distance_loss(anchor, positive, negative, *, distance_function=None, + margin=1.0, swap=False, reduction="mean"): + # type: (Tensor, Tensor, Tensor, Optional[Callable[[Tensor, Tensor], 
Tensor]], float, bool, str) -> Tensor + r""" + See :class:`~torch.nn.TripletMarginWithDistanceLoss` for details. + """ + if torch.jit.is_scripting(): + raise NotImplementedError("F.triplet_margin_with_distance_loss does not support JIT scripting: " + "functions requiring Callables cannot be scripted.") + + tens_ops = (anchor, positive, negative) + if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): + return handle_torch_function( + triplet_margin_with_distance_loss, tens_ops, anchor, positive, negative, + distance_function=distance_function, margin=margin, swap=swap, reduction=reduction) + + distance_function = distance_function if distance_function is not None else pairwise_distance + + positive_dist = distance_function(anchor, positive) + negative_dist = distance_function(anchor, negative) + + if swap: + swap_dist = distance_function(positive, negative) + negative_dist = torch.min(negative_dist, swap_dist) + + output = torch.clamp(positive_dist - negative_dist + margin, min=0.0) + + reduction_enum = _Reduction.get_enum(reduction) + if reduction_enum == 1: + return output.mean() + elif reduction_enum == 2: + return output.sum() + else: + return output + + def normalize(input, p=2, dim=1, eps=1e-12, out=None): # type: (Tensor, float, int, float, Optional[Tensor]) -> Tensor r"""Performs :math:`L_p` normalization of inputs over specified dimension. diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index d7656b72425a..215fb0278dc6 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -22,9 +22,9 @@ GRID_SAMPLE_PADDING_MODES = Dict[str, int] # This was necessary since the JIT uses BroadcastingList* types but static checking with mypy etc requires a `Sequence` # type. There is no way to express the expected lengths of these lists in the current Python typing system. # -# Functions created via `_add_docstr` in `functional.py` where merely typed as `Any` by `stubgen`, so those were -# deleted from the stub and replaced by generated declarations. See `gen_pyi` for the implementation of the code -# generation logic for those functions. In the future, it might be worth looking into using the mypy plugin system +# Functions created via `_add_docstr` in `functional.py` where merely typed as `Any` by `stubgen`, so those were +# deleted from the stub and replaced by generated declarations. See `gen_pyi` for the implementation of the code +# generation logic for those functions. In the future, it might be worth looking into using the mypy plugin system # to encode the type semantics of `_add_docstr`, should that system ever become widespread. def fractional_max_pool2d_with_indices(input: Tensor, kernel_size: _size, output_size: Optional[_size] = ..., output_ratio: Optional[_ratio_any_t] = ..., return_indices: bool = ..., @@ -319,6 +319,11 @@ def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, marg reduce: Optional[bool] = ..., reduction: str = ...) -> Tensor: ... +def triplet_margin_with_distance_loss(anchor: Tensor, positive: Tensor, negative: Tensor, *, + distance_function: Optional[Callable[[Tensor, Tensor], Tensor]]=..., + margin: float=..., swap: bool=..., reduction: str=...) -> Tensor: ... + + def normalize(input: Tensor, p: float = ..., dim: int = ..., eps: float = ..., out: Optional[Tensor] = ...) -> Tensor: ... 
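A minimal usage sketch of the functional form introduced above, assuming only the keyword-only signature declared in the stub; the cosine-based distance below is an arbitrary illustration, not a required choice:

>>> import torch
>>> import torch.nn.functional as F
>>> anchor = torch.randn(4, 8, requires_grad=True)
>>> positive = torch.randn(4, 8, requires_grad=True)
>>> negative = torch.randn(4, 8, requires_grad=True)
>>> loss = F.triplet_margin_with_distance_loss(
...     anchor, positive, negative,
...     distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y),
...     margin=0.5, swap=True)
>>> loss.backward()

Because `distance_function` is an arbitrary Python callable, the functional form deliberately raises NotImplementedError under JIT scripting (see above); the new tests cover the scripted path by tracing the module form instead.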
diff --git a/torch/nn/modules/__init__.py b/torch/nn/modules/__init__.py index b5a03d4a049d..06a565700550 100644 --- a/torch/nn/modules/__init__.py +++ b/torch/nn/modules/__init__.py @@ -8,8 +8,8 @@ Hardsigmoid, Hardswish, SiLU from .loss import L1Loss, NLLLoss, KLDivLoss, MSELoss, BCELoss, BCEWithLogitsLoss, NLLLoss2d, \ CosineEmbeddingLoss, CTCLoss, HingeEmbeddingLoss, MarginRankingLoss, \ - MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, \ - SmoothL1Loss, SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, PoissonNLLLoss + MultiLabelMarginLoss, MultiLabelSoftMarginLoss, MultiMarginLoss, SmoothL1Loss, \ + SoftMarginLoss, CrossEntropyLoss, TripletMarginLoss, TripletMarginWithDistanceLoss, PoissonNLLLoss from .container import Container, Sequential, ModuleList, ModuleDict, ParameterList, ParameterDict from .pooling import AvgPool1d, AvgPool2d, AvgPool3d, MaxPool1d, MaxPool2d, MaxPool3d, \ MaxUnpool1d, MaxUnpool2d, MaxUnpool3d, FractionalMaxPool2d, FractionalMaxPool3d, LPPool1d, LPPool2d, \ @@ -54,5 +54,5 @@ 'ConstantPad3d', 'Bilinear', 'CosineSimilarity', 'Unfold', 'Fold', 'AdaptiveLogSoftmaxWithLoss', 'TransformerEncoder', 'TransformerDecoder', 'TransformerEncoderLayer', 'TransformerDecoderLayer', 'Transformer', - 'Flatten', 'Unflatten', 'Hardsigmoid', 'Hardswish', 'SiLU', + 'Flatten', 'Unflatten', 'Hardsigmoid', 'Hardswish', 'SiLU', 'TripletMarginWithDistanceLoss' ] diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index 62323fda40f4..91a62a85771e 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1,11 +1,12 @@ import warnings +from .distance import PairwiseDistance from .module import Module from .. import functional as F from .. import _reduction as _Reduction from torch import Tensor -from typing import Optional +from typing import Callable, Optional class _Loss(Module): @@ -1191,6 +1192,9 @@ class TripletMarginLoss(_Loss): .. math:: d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p + See also :class:`~torch.nn.TripletMarginWithDistanceLoss`, which computes the + triplet margin loss for input tensors using a custom distance function. + Args: margin (float, optional): Default: :math:`1`. p (int, optional): The norm degree for pairwise distance. Default: :math:`2`. @@ -1215,7 +1219,8 @@ class TripletMarginLoss(_Loss): Shape: - Input: :math:`(N, D)` where :math:`D` is the vector dimension. - - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`. + - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar + otherwise. >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2) >>> anchor = torch.randn(100, 128, requires_grad=True) @@ -1246,6 +1251,120 @@ def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: eps=self.eps, swap=self.swap, reduction=self.reduction) +class TripletMarginWithDistanceLoss(_Loss): + r"""Creates a criterion that measures the triplet loss given input + tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor, + positive, and negative examples, respectively), and a nonnegative, + real-valued function ("distance function") used to compute the relationship + between the anchor and positive example ("positive distance") and the + anchor and negative example ("negative distance"). + + The unreduced loss (i.e., with :attr:`reduction` set to ``'none'``) + can be described as: + + .. 
math:: + \ell(a, p, n) = L = \{l_1,\dots,l_N\}^\top, \quad + l_i = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\} + + where :math:`N` is the batch size; :math:`d` is a nonnegative, real-valued function + quantifying the closeness of two tensors, referred to as the :attr:`distance_function`; + and :math:`margin` is a non-negative margin representing the minimum difference + between the positive and negative distances that is required for the loss to + be 0. The input tensors have :math:`N` elements each and can be of any shape + that the distance function can handle. + + If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then: + + .. math:: + \ell(x, y) = + \begin{cases} + \operatorname{mean}(L), & \text{if reduction} = \text{`mean';}\\ + \operatorname{sum}(L), & \text{if reduction} = \text{`sum'.} + \end{cases} + + See also :class:`~torch.nn.TripletMarginLoss`, which computes the triplet + loss for input tensors using the :math:`l_p` distance as the distance function. + + Args: + distance_function (callable, optional): A nonnegative, real-valued function that + quantifies the closeness of two tensors. If not specified, + `nn.PairwiseDistance` will be used. Default: ``None`` + margin (float, optional): A non-negative margin representing the minimum difference + between the positive and negative distances required for the loss to be 0. Larger + margins penalize cases where the negative examples are not distant enough from the + anchors, relative to the positives. Default: :math:`1`. + swap (bool, optional): Whether to use the distance swap described in the paper + `Learning shallow convolutional feature descriptors with triplet losses` by + V. Balntas, E. Riba et al. If True, and if the positive example is closer to the + negative example than the anchor is, swaps the positive example and the anchor in + the loss computation. Default: ``False``. + reduction (string, optional): Specifies the (optional) reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. Default: ``'mean'`` + + + Shape: + - Input: :math:`(N, *)` where :math:`*` represents any number of additional dimensions + as supported by the distance function. + - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'``, or a scalar + otherwise. 
+ + Examples:: + + >>> # Initialize embeddings + >>> embedding = nn.Embedding(1000, 128) + >>> anchor_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> positive_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> negative_ids = torch.randint(0, 1000, (1,), requires_grad=True) + >>> anchor = embedding(anchor_ids) + >>> positive = embedding(positive_ids) + >>> negative = embedding(negative_ids) + >>> + >>> # Built-in Distance Function + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance()) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + >>> + >>> # Custom Distance Function + >>> def l_infinity(x1, x2): + >>> return torch.max(torch.abs(x1 - x2), dim=1).values + >>> + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss(distance_function=l_infinity, margin=1.5) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + >>> + >>> # Custom Distance Function (Lambda) + >>> triplet_loss = \ + >>> nn.TripletMarginWithDistanceLoss( + >>> distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y)) + >>> output = triplet_loss(anchor, positive, negative) + >>> output.backward() + + Reference: + V. Balntas, et al.: Learning shallow convolutional feature descriptors with triplet losses: + http://www.bmva.org/bmvc/2016/papers/paper119/index.html + """ + __constants__ = ['margin', 'swap', 'reduction'] + margin: float + swap: bool + + def __init__(self, *, distance_function: Optional[Callable[[Tensor, Tensor], Tensor]] = None, + margin: float = 1.0, swap: bool = False, reduction: str = 'mean'): + super(TripletMarginWithDistanceLoss, self).__init__(size_average=None, reduce=None, reduction=reduction) + self.distance_function = distance_function if distance_function is not None else PairwiseDistance() + self.margin = margin + self.swap = swap + + def forward(self, anchor: Tensor, positive: Tensor, negative: Tensor) -> Tensor: + return F.triplet_margin_with_distance_loss(anchor, positive, negative, + distance_function=self.distance_function, + margin=self.margin, swap=self.swap, reduction=self.reduction) + + class CTCLoss(_Loss): r"""The Connectionist Temporal Classification loss. 
diff --git a/torch/overrides.py b/torch/overrides.py index d17c6c4f7473..b287bf17958a 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -624,6 +624,9 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.nn.functional.threshold: lambda input, threshold, value, inplace=False: -1, torch.nn.functional.triplet_margin_loss: (lambda anchor, positive, negative, margin=1.0, p=2, eps=1e-06, swap=False, size_average=None, reduce=None, reduction='mean': -1), + torch.nn.functional.triplet_margin_with_distance_loss: (lambda anchor, positive, negative, *, + distance_function=None, margin=1.0, + swap=False, reduction='mean': -1), torch.nn.functional.unfold: lambda input, kernel_size, dilation=1, padding=0, stride=1: -1, torch.nonzero: lambda input, as_tuple=False: -1, torch.norm: lambda input, p='fro', dim=None, keepdim=False, out=None, dtype=None: -1, From e2b40ce793674531e0435510c6a1fe8f63e3958a Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Tue, 22 Sep 2020 11:40:12 -0700 Subject: [PATCH 018/449] Support BFloat16 for binary logical operators on CUDA (#42485) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/42485 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D23684423 Pulled By: mruberry fbshipit-source-id: edc2b46b726361d4c8bf8a4bf4e4a09197b20428 --- .../ATen/native/cuda/BinaryLogicalOpsKernels.cu | 6 +++--- test/test_torch.py | 16 ---------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index 2a9b188520f5..20a851d1b2ce 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -10,7 +10,7 @@ namespace at { namespace native { void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_and_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_and_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a && b; }); @@ -18,7 +18,7 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_or_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_or_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); @@ -26,7 +26,7 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_xor_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_xor_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); diff --git a/test/test_torch.py b/test/test_torch.py index c8dfd5115333..440bf30286bb 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6342,16 +6342,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): a = torch.tensor(a_, dtype=dtypes[0], device=device) b = torch.tensor(b_, dtype=dtypes[1], device=device) - # Skip bfloat16 on CUDA. Remove this after bfloat16 is supported on CUDA. 
- # After type promotion of bfloat16 is supported, some bfloat16 logical operation will go through on - # CUDA as long as the two tensors are promoted to a supported type. - # TODO: Remove this once logical operators are improved to take care of bfloat16. - if self.device_type == 'cuda' and torch.bfloat16 in dtypes: - if torch.promote_types(dtypes[0], dtypes[1]) == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - if dtypes[0].is_complex or dtypes[1].is_complex: with self.assertRaises(RuntimeError): getattr(a, op)(b) @@ -6371,12 +6361,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) return - # TODO: remove when logical ops support bfloat16 on CUDA. - if self.device_type == 'cuda' and dtypes[0] == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - # TODO: remove when complex ops are supported if dtypes[0].is_complex: with self.assertRaises(RuntimeError): From 09aee06e821546d1aaee345143183e42261cb674 Mon Sep 17 00:00:00 2001 From: Daya Khudia Date: Tue, 22 Sep 2020 11:42:57 -0700 Subject: [PATCH 019/449] [caffe2] Replace embedding conversion ops with fbgemm functions (#44843) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44843 Replace perfkernels calls with fbgemm kernels to avoid code duplication ghstack-source-id: 112496292 Test Plan: CI Reviewed By: radkris-git Differential Revision: D23675519 fbshipit-source-id: 05c285a9eeb9ea109a04a78cb442a24ee40a4aec --- .../fused_nbit_rowwise_conversion.cc | 92 ++- .../fused_nbit_rowwise_conversion_avx2.cc | 534 ------------------ test/quantization/test_quantized_op.py | 4 +- 3 files changed, 34 insertions(+), 596 deletions(-) delete mode 100644 caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc index 528bbee3c2ca..35b9605021e6 100644 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc +++ b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc @@ -6,6 +6,10 @@ #include "common.h" +#ifdef USE_FBGEMM +#include "fbgemm/QuantUtils.h" +#endif + namespace caffe2 { void FloatToFused8BitRowwiseQuantized__base( @@ -58,46 +62,32 @@ void Fused8BitRowwiseQuantizedToFloat__base( } } -decltype(FloatToFused8BitRowwiseQuantized__base) - FloatToFused8BitRowwiseQuantized__avx2_fma; void FloatToFused8BitRowwiseQuantized( const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + input, input_rows, input_columns, output); +#else + FloatToFused8BitRowwiseQuantized__base( + input, input_rows, input_columns, output); +#endif } -decltype(Fused8BitRowwiseQuantizedToFloat__base) - Fused8BitRowwiseQuantizedToFloat__avx2_fma; void Fused8BitRowwiseQuantizedToFloat( const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); - BASE_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output); +#else + Fused8BitRowwiseQuantizedToFloat__base( + input, 
input_rows, input_columns, output); +#endif } void FloatToFusedNBitRowwiseQuantizedSBHalf__base( @@ -184,52 +174,34 @@ void FusedNBitRowwiseQuantizedSBHalfToFloat__base( } } -decltype(FloatToFusedNBitRowwiseQuantizedSBHalf__base) - FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma; void FloatToFusedNBitRowwiseQuantizedSBHalf( int bit_rate, const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_rate, input, input_rows, input_columns, output); +#else + FloatToFusedNBitRowwiseQuantizedSBHalf__base( + bit_rate, input, input_rows, input_columns, output); +#endif } -decltype(FusedNBitRowwiseQuantizedSBHalfToFloat__base) - FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma; void FusedNBitRowwiseQuantizedSBHalfToFloat( int bit_rate, const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + bit_rate, input, input_rows, input_columns, output); +#else + FusedNBitRowwiseQuantizedSBHalfToFloat__base( + bit_rate, input, input_rows, input_columns, output); +#endif } } // namespace caffe2 diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc deleted file mode 100644 index e7053b5136c0..000000000000 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc +++ /dev/null @@ -1,534 +0,0 @@ -#include "./fused_nbit_rowwise_conversion.h" - -#include -#include -#include // for FLT_MAX -#include - -#include "./cvtsh_ss_bugfix.h" - -namespace caffe2 { - -constexpr int VLEN = 8; - -void FloatToFused8BitRowwiseQuantized__avx2_fma( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - constexpr float kEpsilon = 1e-8f; - - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - __m256i shuffle_mask_v = _mm256_set_epi8( - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00); - __m256i permute_mask2_v = - _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); - - int output_columns = input_columns + 2 * sizeof(float); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - float* output_row_scale_bias = - reinterpret_cast(output_row + input_columns); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - 
_mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - float range = maximum_element - minimum_element; - - output_row_scale_bias[0] = range / 255.0f; - output_row_scale_bias[1] = minimum_element; - const auto inverse_scale = 255.0f / (range + kEpsilon); - min_v = _mm256_set1_ps(minimum_element); - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - - for (col = 0; col < input_columns / (4 * VLEN) * (4 * VLEN); - col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - _mm256_storeu_si256( - reinterpret_cast<__m256i*>(output_row + col), xyzw_packed_v); - } - for (; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256i rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - - // An instruction sequence to save 8 32-bit integers as 8-bit integers - rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v); - rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask2_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col), - _mm256_castsi256_si128(rounded_v)); - } - for (; col < input_columns; ++col) { - output_row[col] = - std::lrintf((input_row[col] - minimum_element) * inverse_scale); - } - } -} - -void Fused8BitRowwiseQuantizedToFloat__avx2_fma( - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - int output_columns = input_columns - 2 * sizeof(float); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const float* input_row_scale_bias = - reinterpret_cast(input_row + output_columns); - float* output_row = output + row * output_columns; - - __m256 scale_v = _mm256_set1_ps(input_row_scale_bias[0]); - __m256 bias_v = _mm256_set1_ps(input_row_scale_bias[1]); - - std::size_t col; - for (col = 0; col < output_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast(input_row + col)))); - _mm256_storeu_ps( - output_row + col, - _mm256_add_ps(_mm256_mul_ps(in_v, scale_v), bias_v)); - } - - for (; col < output_columns; ++col) { - output_row[col] = - input_row[col] * input_row_scale_bias[0] + input_row_scale_bias[1]; - } - } -} - -namespace { - -template -void 
FloatToFusedNBitRowwiseQuantizedSBHalf_( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + - 2 * sizeof(std::uint16_t); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - std::uint16_t* output_row_scale_bias = reinterpret_cast( - output_row + - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - _mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - output_row_scale_bias[1] = _cvtss_sh( - minimum_element, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - minimum_element = _cvtsh_ss(output_row_scale_bias[1]); - const float range = maximum_element - minimum_element; - - float scale = range == 0 ? 
1.0f : range / ((1 << BIT_RATE) - 1); - std::uint16_t scale_fp16 = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - scale = _cvtsh_ss(scale_fp16); - if (scale == 0) { - // Corner case handling when maximum_element == minimum_element - // Any scale would work because maximum_element - minimum_element will be - // 0 for all X - scale = 1.0f; - } - float inverse_scale = 1.0f / scale; - if (std::isinf(inverse_scale)) { - scale = 1.0f; - inverse_scale = 1.0f; - } - - output_row_scale_bias[0] = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - min_v = _mm256_set1_ps(minimum_element); - - col = 0; - - if (BIT_RATE == 2 || BIT_RATE == 4) { - for (; col + 4 * VLEN <= input_columns; col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - - // saturate to BIT_RATE - xyzw_packed_v = _mm256_min_epu8( - xyzw_packed_v, - _mm256_set1_epi8(static_cast((1 << BIT_RATE) - 1))); - - if (BIT_RATE == 4) { - // pack into lower 8-bit of each 16-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi16(xyzw_packed_v, 4)), - _mm256_set1_epi16(0x00ff)); - } else { - // pack into lower 8-bit of each 32-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi32(xyzw_packed_v, 6)), - _mm256_or_si256( - _mm256_srli_epi32(xyzw_packed_v, 8 + 4), - _mm256_srli_epi32(xyzw_packed_v, 2 * 8 + 2))), - _mm256_set1_epi32(0x00ff)); - } - - __m128i out_v; - if (BIT_RATE == 4) { - // avx2 doesn't have _mm256_cvtepi16_epi8 - out_v = _mm_packus_epi16( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - _mm_storeu_si128( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } else { - // avx2 doesn't have _mm256_cvtepi32_epi8 - out_v = _mm_packus_epi32( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - out_v = _mm_packus_epi16(out_v, out_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } - } - } - - for (; col < input_columns; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min( - std::lrintf((X - minimum_element) * inverse_scale), - (1 << BIT_RATE) - 1)); - if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; - } else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); - } - } - } -} - -template -void FusedNBitRowwiseQuantizedSBHalfToFloat_( - const std::uint8_t* input, - int 
input_rows, - int input_columns, - float* output) { - constexpr int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns - 2 * sizeof(std::uint16_t)) * NUM_ELEM_PER_BYTE; - - // mask can be accessed by avx2_ps_or_epi32_combined_mask[(8 - remainder) % 8] - static const int avx2_ps_or_epi32_combined_mask[16] = { - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - }; - - // Compute a remainder for vector load - // Since every row is followed by 2 fp16 (scale and bias), luckily - // we don't need mask at bit-rate granularity but just at 32-bit - // granularity. - constexpr int NUM_ELEM_PER_32BIT = 32 / BIT_RATE; - // multiply by 4 because we're handling 4 vlen per iteration - constexpr int NUM_OF_32BIT_PER_VLOAD = VLEN * 4 / NUM_ELEM_PER_32BIT; - int remainder_32bit_granularity = (output_columns + NUM_ELEM_PER_32BIT - 1) / - NUM_ELEM_PER_32BIT % NUM_OF_32BIT_PER_VLOAD; - __m128i vmask_load = _mm_lddqu_si128(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + NUM_OF_32BIT_PER_VLOAD + - (NUM_OF_32BIT_PER_VLOAD - remainder_32bit_granularity) % - NUM_OF_32BIT_PER_VLOAD)); - int remainder = output_columns % (4 * VLEN); - __m256i vmask_store0 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - std::min(output_columns % (4 * VLEN), VLEN) % (VLEN + 1)))); - __m256i vmask_store1 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store2 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 2 * VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store3 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 3 * VLEN, VLEN)) % - (VLEN + 1)))); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const std::uint16_t* input_row_scale_bias = - reinterpret_cast( - input_row + - (output_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - float scale = _cvtsh_ss(input_row_scale_bias[0]); - float bias = _cvtsh_ss(input_row_scale_bias[1]); - float* output_row = output + row * output_columns; - - std::size_t col = 0; - if (BIT_RATE == 4 || BIT_RATE == 2) { - __m256 vscale = _mm256_set1_ps(scale); - __m256 vbias = _mm256_set1_ps(bias); - for (; col + 4 * VLEN <= output_columns; col += 4 * VLEN) { - __m256i vinq; - // unpack to 8-bit integers - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16( - _mm_loadu_si128(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = 
_mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - _mm256_storeu_ps(output_row + col, vinq0); - _mm256_storeu_ps(output_row + col + VLEN, vinq1); - _mm256_storeu_ps(output_row + col + 2 * VLEN, vinq2); - _mm256_storeu_ps(output_row + col + 3 * VLEN, vinq3); - } - - if (remainder) { - __m256i vinq; - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - - _mm256_maskstore_ps(output_row + col, vmask_store0, vinq0); - _mm256_maskstore_ps(output_row + col + VLEN, vmask_store1, vinq1); - _mm256_maskstore_ps(output_row + col + 2 * VLEN, vmask_store2, vinq2); - _mm256_maskstore_ps(output_row + col + 3 * VLEN, vmask_store3, vinq3); - } - } else { - for (; col < output_columns; ++col) { - std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; - quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; - quantized &= (1 << BIT_RATE) - 1; - output_row[col] = scale * quantized + bias; - } - } - } -} -} // namespace - -void FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma( - int bit_rate, - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - if (bit_rate == 2) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<4>( - input, input_rows, input_columns, output); - } else if (bit_rate == 8) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<8>( - input, input_rows, input_columns, output); - } -} - -void FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma( - int bit_rate, - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - if (bit_rate == 2) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<4>( - input, input_rows, input_columns, output); - } else { - FusedNBitRowwiseQuantizedSBHalfToFloat_<8>( - input, input_rows, input_columns, output); - } -} - -} // namespace caffe2 diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 3a0e6f10bf33..674ace864343 100644 --- 
a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -2779,9 +2779,9 @@ def get_c2_weights(weights): w_packed_c2, w_unpacked_c2 = get_c2_weights(weights) # Compare packed weights against C2. - np.testing.assert_equal(w_packed.numpy(), w_packed_c2.numpy()) + np.testing.assert_allclose(w_packed.numpy(), w_packed_c2.numpy(), atol=1e-6, rtol=1e-6) # Compare unpacked weights against C2 - np.testing.assert_equal(w_unpacked.numpy(), w_unpacked_c2.numpy()) + np.testing.assert_allclose(w_unpacked.numpy(), w_unpacked_c2.numpy(), atol=1e-6, rtol=1e-6) """ Tests the correctness of the embedding_bag_8bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), From 2b1f25885e667b29f806afd01f68652393fbd07c Mon Sep 17 00:00:00 2001 From: Zafar Date: Tue, 22 Sep 2020 11:44:14 -0700 Subject: [PATCH 020/449] [quant] Fix ConvTranspose mapping (#44844) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44844 Test Plan: Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23746466 Pulled By: z-a-f fbshipit-source-id: cb84e0fef5ab82e8ed8dd118d9fb21ee7b480ef7 --- torch/quantization/quantization_mappings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 585b018a5b01..60d166ae4896 100644 --- a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -21,6 +21,8 @@ nn.Conv1d: nnq.Conv1d, nn.Conv2d: nnq.Conv2d, nn.Conv3d: nnq.Conv3d, + nn.ConvTranspose1d: nnq.ConvTranspose1d, + nn.ConvTranspose2d: nnq.ConvTranspose2d, nn.BatchNorm2d: nnq.BatchNorm2d, nn.BatchNorm3d: nnq.BatchNorm3d, nn.LayerNorm: nnq.LayerNorm, From c253b101545cc1a2ba5a4ab467cd972a63ac072d Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Tue, 22 Sep 2020 11:51:58 -0700 Subject: [PATCH 021/449] Fix incorrect EnumValue serialization issue (#44891) Summary: Previously, `prim::EnumValue` is serialized to `ops.prim.EnumValue`, which doesn't have the right implementation to refine return type. This diff correctly serializes it to enum.value, thus fixing the issue. 
Fixes https://github.com/pytorch/pytorch/issues/44892 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44891 Reviewed By: malfet Differential Revision: D23818962 Pulled By: gmagogsfm fbshipit-source-id: 6edfdf9c4b932176b08abc69284a916cab10081b --- test/jit/test_enum.py | 20 +++++++++++++++++++ torch/csrc/jit/serialization/python_print.cpp | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index a242217a94c1..aa34c22413ad 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -267,6 +267,26 @@ def forward(self): self.assertEqual(scripted(), Color.RED.value) + def test_string_enum_as_module_attribute(self): + global Color + + class Color(Enum): + RED = "red" + GREEN = "green" + + class TestModule(torch.nn.Module): + def __init__(self, e: Color): + super(TestModule, self).__init__() + self.e = e + + def forward(self): + return (self.e.name, self.e.value) + + m = TestModule(Color.RED) + scripted = torch.jit.script(m) + + self.assertEqual(scripted(), (Color.RED.name, Color.RED.value)) + def test_enum_return(self): global Color diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index b8339b5c86a7..e04339dacc22 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1126,6 +1126,16 @@ struct PythonPrintImpl { stmt << useOf(node->input(0)) << ".tolist()" << ")"; } break; + case prim::EnumValue: + // Note: This CAN NOT be printed as raw operator ops.prim.EnumValue + // because its return type depends on type of enum and must be further + // resolved, but ops.prim.EnumValue construction does not provide such + // functionality. + stmt << "(" << useOf(node->input()) << ").value"; + break; + case prim::EnumName: + stmt << "(" << useOf(node->input()) << ").name"; + break; default: { printOpName(stmt, node->kind()); const FunctionSchema& schema = node->schema(); From a4ce3f4194d2a5764d228f623b3daaf72480ed51 Mon Sep 17 00:00:00 2001 From: Viswesh Sankaran Date: Tue, 22 Sep 2020 13:37:40 -0700 Subject: [PATCH 022/449] Fix type hint warnings for common_methods_invocations.py (#44971) Summary: Fixes a subtask of https://github.com/pytorch/pytorch/issues/42969 Tested the following and no warnings were seen. python test/test_type_hints.py .... 
---------------------------------------------------------------------- Ran 4 tests in 180.759s OK Pull Request resolved: https://github.com/pytorch/pytorch/pull/44971 Reviewed By: walterddr Differential Revision: D23822274 Pulled By: visweshfb fbshipit-source-id: e3485021e348ee0a8508a9d128f04bad721795ef --- mypy.ini | 3 --- torch/testing/_internal/common_methods_invocations.py | 8 +++++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mypy.ini b/mypy.ini index a7d4acea9571..1891f2790d1e 100644 --- a/mypy.ini +++ b/mypy.ini @@ -62,9 +62,6 @@ ignore_errors = True [mypy-torch.testing._internal.hypothesis_utils.*] ignore_errors = True -[mypy-torch.testing._internal.common_methods_invocations.*] -ignore_errors = True - [mypy-torch.testing._internal.common_nn.*] ignore_errors = True diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index dd429deacbf0..b208f220e30a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -7,6 +7,8 @@ from torch._six import inf, istuple from torch.autograd import Variable +from typing import List, Tuple, Dict, Any + from torch.testing import \ (make_non_contiguous, _dispatch_dtypes, floating_types, floating_types_and, floating_and_complex_types, @@ -1540,7 +1542,7 @@ def _compare_large_trilu_indices( (2028, 1, -1) ] -tri_large_tests_args = [ +tri_large_tests_args: List[Tuple[int, ...]] = [ # Large test cases below are deliberately commented out to speed up CI # tests and to avoid OOM error. When modifying implementations of # tril_indices and triu_indices, please enable these tests and make sure @@ -1602,9 +1604,9 @@ def unpack_variables(args): 'reshape', 'where' # argument order } -EXCLUDE_GRADCHECK = { +EXCLUDE_GRADCHECK: Dict[str, Any] = { } -EXCLUDE_GRADGRADCHECK = { +EXCLUDE_GRADGRADCHECK: Dict[str, Any] = { } EXCLUDE_GRADGRADCHECK_BY_TEST_NAME = { # *det methods uses svd in backward when matrix is not invertible. 
However, From def433bbb6a914532bf3eb0687751a56e8dae685 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 22 Sep 2020 14:55:49 -0700 Subject: [PATCH 023/449] .circleci: Upgrade all xcode 9 workers to xcode 11 (#45153) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45153 xcode 9 is being deprectated within circleci infra so we should get everything else on a more recent version of xcode Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D23852774 Pulled By: seemethere fbshipit-source-id: c02e162f1993d408de439fee21b340e9640e5a24 --- .circleci/config.yml | 8 ++++---- .circleci/verbatim-sources/job-specs/binary-job-specs.yml | 4 ++-- .circleci/verbatim-sources/job-specs/job-specs-custom.yml | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5ca2d725b9e9..c952ee716b3d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -924,7 +924,7 @@ jobs: smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run: @@ -949,7 +949,7 @@ jobs: binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -1253,7 +1253,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run_brew_for_macos_build @@ -1287,7 +1287,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - attach_workspace: diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index bd26e8b2b373..7e635f42bce4 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -135,7 +135,7 @@ smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run: @@ -160,7 +160,7 @@ binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 810f16922d5c..9cc75136cfdd 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -109,7 +109,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - run_brew_for_macos_build @@ -143,7 +143,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "11.2.1" steps: - checkout - attach_workspace: From 79fe794f871691f7c4f3727694a3b9a9339b32f3 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Sep 2020 14:56:15 -0700 Subject: [PATCH 024/449] [FX] Make Graphs immutable and make GraphModule recompile after assigning graph (#44830) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44830 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D23743850 Pulled By: jamesr66a fbshipit-source-id: 501b92a89ff636c26abeff13105a75462384554c --- test/test_fx.py | 13 ++++++++----- torch/fx/graph.py | 20 ++++++++++++++++---- torch/fx/graph_module.py | 30 ++++++++++++++++++++++++++---- 3 
files changed, 50 insertions(+), 13 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index 89311e2a2873..f191a73c40c4 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -174,10 +174,12 @@ def forward(self, a, b): return a + b m = M() g = symbolic_trace(m).graph - t = Proxy(g.result) + new_g = torch.fx.Graph() + new_g.graph_copy(g) + t = Proxy(new_g.nodes[-1]) # test that we can use proxy objects to generate more graph code later for things that do not need to work with modules. - g.output((t + t).node) - gm = GraphModule(m, g) + new_g.output((t + t).node) + gm = GraphModule(m, new_g) self.assertEqual(gm(3, 4), 14) @skipIfNoTorchVision @@ -466,9 +468,10 @@ def test_deepcopy_graphmodule_with_transform(self): traced = symbolic_trace(st) def transform(traced): - new_graph = copy.deepcopy(traced.graph) + new_graph = torch.fx.Graph() + new_graph.graph_copy(traced.graph) relu_out = new_graph.create_node( - op='call_method', target='neg', args=(new_graph.result,), kwargs={}) + op='call_method', target='neg', args=(new_graph.nodes[-1],), kwargs={}) new_graph.output(relu_out) return GraphModule(traced, new_graph) transformed = transform(traced) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index a63b7c8b35dc..6214f60c61e6 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -67,9 +67,21 @@ def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: class Graph: def __init__(self): - self.nodes : List[Node] = [] + self._nodes : List[Node] = [] self._used_names : Dict[str, int] = {} # base name -> number + @property + def nodes(self): + return tuple(self._nodes) + + def graph_copy(self, g : 'Graph'): + """ + Append all nodes from graph `g` to this graph + """ + val_map : Dict[Node, Node] = {} + for node in g._nodes: + val_map[node] = self.node_copy(node, lambda n : val_map[n]) + def _mark_uses(self, a: Argument): def add_use(n: Node): n.uses += 1 @@ -86,7 +98,7 @@ def create_node(self, op: str, target: Target, self._mark_uses(args) self._mark_uses(kwargs) n = Node(self, name if name is not None else self._name(target), op, target, args, kwargs) - self.nodes.append(n) + self._nodes.append(n) return n # sugar for above when you know the op @@ -161,7 +173,7 @@ def _name(self, target: Target) -> str: def python_code(self, root_module: str) -> Tuple[str, str, List[str]]: free_vars: List[str] = [] body: List[str] = [] - for node in self.nodes: + for node in self._nodes: if node.op == 'placeholder': assert isinstance(node.target, str) free_vars.append(node.target) @@ -237,7 +249,7 @@ def format_node(n : Node) -> Optional[str]: f'args = {format_arg(n.args)}, kwargs = {format_arg(n.kwargs)})' - node_strs = [format_node(node) for node in self.nodes] + node_strs = [format_node(node) for node in self._nodes] param_str = ', '.join(placeholder_names) s = f'graph({param_str}):' for node_str in node_strs: diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index df40cbd84fe1..83feed72b752 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -97,6 +97,17 @@ def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str): setattr(to_module, field, from_obj) class GraphModule(torch.nn.Module): + """ + GraphModule is an nn.Module generated from an fx.Graph. 
GraphModule has + important attributes: + + graph : The graph from which this GraphModule was generated + code : The Python source code for the function generated from `graph` + forward : The Python method generated from `graph` + + Note that when `graph` is reassigned, `code` and `forward` will be automatically + regenerated. + """ def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # each instance of a graph module needs its own forward method # so create a new singleton class for each instance. @@ -148,10 +159,21 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): else: raise RuntimeError('Unsupported type ' + str(root) + ' passed for root!') self.graph = graph - self._generate_forward() - def _generate_forward(self) -> None: - body, result, free_variables = self.graph.python_code(root_module='self') + # TorchScript breaks trying to compile the graph setter because of the + # continued string literal. Issue here: https://github.com/pytorch/pytorch/issues/44842 + # + # Shouldn't be an issue since these methods shouldn't be used in TorchScript anyway + __ignored_properties__ = ['graph'] + + @property + def graph(self): + return self._graph + + @graph.setter + def graph(self, val) -> None: + self._graph = val + body, result, free_variables = self._graph.python_code(root_module='self') body = '\n'.join(' ' + line for line in body.split('\n')) + '\n' self.code = f"""\ def forward(self, {', '.join(free_variables)}): @@ -163,7 +185,7 @@ def forward(self, {', '.join(free_variables)}): def __reduce__(self): dict_without_graph = self.__dict__.copy() - del dict_without_graph['graph'] + del dict_without_graph['_graph'] return (deserialize_graphmodule, (dict_without_graph,)) # because __reduce__ is defined for serialization, From d1c68a706985d7115c42bfb007b4cf643d172050 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Tue, 22 Sep 2020 15:06:14 -0700 Subject: [PATCH 025/449] Clarify that 5-D 'bilinear' grid_sample is actually trilinear (#45090) Summary: Fixes https://github.com/pytorch/pytorch/issues/41528 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45090 Reviewed By: ailzhang Differential Revision: D23841046 Pulled By: zou3519 fbshipit-source-id: 941770cd5b3e705608957739026e9113e5f0c616 --- torch/nn/functional.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index f4dbceeb88b1..2fdb40b2d93f 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -3331,6 +3331,9 @@ def grid_sample(input, grid, mode='bilinear', padding_mode='zeros', align_corner or :math:`(N, D_\text{out}, H_\text{out}, W_\text{out}, 3)` (5-D case) mode (str): interpolation mode to calculate output values ``'bilinear'`` | ``'nearest'``. Default: ``'bilinear'`` + Note: When ``mode='bilinear'`` and the input is 5-D, the interpolation mode + used internally will actually be trilinear. However, when the input is 4-D, + the interpolation mode will legitimately be bilinear. padding_mode (str): padding mode for outside grid values ``'zeros'`` | ``'border'`` | ``'reflection'``. Default: ``'zeros'`` align_corners (bool, optional): Geometrically, we consider the pixels of the From cddcfde81d6482baa2e84fd1400e9ee60c4f9a3e Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 22 Sep 2020 16:28:39 -0700 Subject: [PATCH 026/449] [JIT] Fix WithTest.test_with_exceptions (#45106) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45106 **Summary** This commit fixes `WithTest.test_with_exceptions`. 
It's been running in regular Python this whole time; none of the functions created and invoked for the test were scripted. Fortunately, the tests still pass after being fixed. **Test Plan** Ran unit tests + continuous integration. Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D23848206 Pulled By: SplitInfinity fbshipit-source-id: fd975ee34db9441ef4e4a4abf2fb21298166bbaa --- test/jit/test_with.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/jit/test_with.py b/test/jit/test_with.py index 15e1362ea722..ffd0631639f6 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -359,6 +359,7 @@ def test_with_exceptions(self): Check that exceptions thrown in the bodies of with-statements are handled correctly. """ + global Context @torch.jit.script class Context(object): @@ -379,10 +380,12 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + @torch.jit.script def method_that_raises(): # type: () -> Tensor - raise Exception() + raise Exception("raised exception") + @torch.jit.script def test_exception(x, c): # type: (Tensor, Context) -> Tensor """ @@ -393,6 +396,7 @@ def test_exception(x, c): return x + @torch.jit.script def test_exception_nested(x, c): # type: (Tensor, Context) -> Tensor """ @@ -404,6 +408,7 @@ def test_exception_nested(x, c): return x + @torch.jit.script def with_that_raises(c): # type: (Context) -> Tensor a = torch.tensor([1]) @@ -413,6 +418,7 @@ def with_that_raises(c): return a + @torch.jit.script def test_exception_fn_call(x, c): # type: (Tensor, Context) -> Tensor """ @@ -426,15 +432,18 @@ def test_exception_fn_call(x, c): c = Context(1) - with self.assertRaises(Exception): + # checkScript and checkScriptRaisesRegex cannot be used because the string frontend will + # not compile class types (of which Context, the context manager being used for this test + # is one). + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_nested(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_fn_call(torch.randn(2), c) self.assertEqual(c.count, 1) From 35cdb01327ddbfc886ca08a60064009fe362fdad Mon Sep 17 00:00:00 2001 From: hangjunxu Date: Tue, 22 Sep 2020 16:52:34 -0700 Subject: [PATCH 027/449] [PyTorch] Enable type check for autocast_test_lists (#45107) Summary: This is a sub-task for addressing: https://github.com/pytorch/pytorch/issues/42969. We re-enable type check for `autocast_test_lists `. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45107 Test Plan: `python test/test_type_hints.py` passed: ``` (pytorch) bash-5.0$ with-proxy python test/test_type_hints.py .... 
---------------------------------------------------------------------- Ran 4 tests in 103.871s OK ``` Reviewed By: walterddr Differential Revision: D23842884 Pulled By: Hangjun fbshipit-source-id: a39f3810e3abebc6b4c1cb996b06312f6d42ffd6 --- mypy.ini | 3 --- torch/testing/_internal/autocast_test_lists.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/mypy.ini b/mypy.ini index 1891f2790d1e..07cdbc4dd6fa 100644 --- a/mypy.ini +++ b/mypy.ini @@ -56,9 +56,6 @@ ignore_errors = True [mypy-torch.testing._internal.codegen.*] ignore_errors = True -[mypy-torch.testing._internal.autocast_test_lists.*] -ignore_errors = True - [mypy-torch.testing._internal.hypothesis_utils.*] ignore_errors = True diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 015cbd658816..13f65952af24 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -155,8 +155,7 @@ def __init__(self, dev): ("norm", pointwise0_fp16, {"p": 1}), ("norm", pointwise0_fp16, {"p": 1, "dim": 0}), ("cosine_similarity", mat0_fp16 + mat1_fp16), - ("poisson_nll_loss", mat0_fp16 + mat1_fp16 + (True, False, 1.e-8, - torch.nn.functional._Reduction.get_enum('mean'))), + ("poisson_nll_loss", mat0_fp16 + mat1_fp16 + (True, False, 1.e-8, torch.nn._reduction.get_enum('mean'))), ("cosine_embedding_loss", (torch.tensor([[1, 2, 3]], device=dev, dtype=torch.float16), torch.tensor([[1, 3, 4]], device=dev, dtype=torch.float16), torch.tensor([1], device=dev, dtype=torch.int))), From 7f4a27be3a23487cb74c578d31535c7e4c8aa6c4 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 22 Sep 2020 17:04:35 -0700 Subject: [PATCH 028/449] [resubmit][FX] s/get_param/get_attr/ (#45147) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45147 ghstack-source-id: 112605923 Test Plan: Imported from OSS Reviewed By: eellison Differential Revision: D23845096 fbshipit-source-id: 9ca209aa84cbaddd6e89c52b541e43b11197e2d5 --- test/fx/quantization.py | 2 +- test/test_fx.py | 6 +++--- torch/fx/__init__.py | 4 ++-- torch/fx/graph.py | 10 +++++----- torch/fx/graph_module.py | 4 ++-- torch/fx/symbolic_trace.py | 8 ++++---- torch/quantization/fx/quantize.py | 4 ++-- torch/quantization/fx/utils.py | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/test/fx/quantization.py b/test/fx/quantization.py index 8116ed5ce89a..968c797c9163 100644 --- a/test/fx/quantization.py +++ b/test/fx/quantization.py @@ -222,7 +222,7 @@ def load_arg(a): for node in self.graph.nodes: if node.op == 'placeholder': result = next(args_iter) - elif node.op == 'get_param': + elif node.op == 'get_attr': result = self.state_dict[node.target] elif node.op == 'call_function': result = node.target(*load_arg(node.args), **load_arg(node.kwargs)) diff --git a/test/test_fx.py b/test/test_fx.py index f191a73c40c4..a48274e16809 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -338,7 +338,7 @@ def __init__(self, interpreter): placeholder_nodes.append(graph.create_node('placeholder', name)) # Get the interpreter object - interpreter_node = graph.create_node('get_param', 'interpreter') + interpreter_node = graph.create_node('get_attr', 'interpreter') # Add a node to call the interpreter instance output_node = graph.create_node( @@ -570,7 +570,7 @@ def test_graph_fns(self): g = Graph() a = g.placeholder('a') b = g.call_module('linear', (a,)) - c = g.get_param('bias') + c = g.get_attr('bias') d = g.call_method('add', (b, c)) e = 
g.call_function(torch.sin, (d,)) g.output(e) @@ -587,7 +587,7 @@ def test_construct_root_dict(self): graph : torch.fx.Graph = torch.fx.Graph() a : torch.fx.Node = graph.create_node('placeholder', 'x') b : torch.fx.Node = graph.create_node('call_module', 'foo.bar.baz', args=(a,)) - c : torch.fx.Node = graph.create_node('get_param', 'zip.zap.zam') + c : torch.fx.Node = graph.create_node('get_attr', 'zip.zap.zam') d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) graph.output(d) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index 5b90c434340c..185511460740 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -36,7 +36,7 @@ def forward(self, x): opcode name target args kwargs ------------- ------------- ------------------------------------------------------- ------------------ ----------- placeholder x x () {} -get_param linear_weight linear.weight () {} +get_attr linear_weight linear.weight () {} call_function add_1 (x, linear_weight) {} call_module linear_1 linear (add_1,) {} call_method relu_2 relu [linear_1] {} @@ -48,7 +48,7 @@ def forward(self, x): - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. `target` is similarly the name of the argument. `args` and `kwargs` are don't-care -- `get_param` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the +- `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. `args` and `kwargs` are don't-care - `call_function` applies a free function to some values. `name` is similarly the name of the value to assign diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 6214f60c61e6..6ca60f6211aa 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -92,7 +92,7 @@ def create_node(self, op: str, target: Target, args: Optional[Tuple[Argument, ...]] = None, kwargs: Optional[Dict[str, Argument]] = None, name: Optional[str] = None) -> Node: - assert op in ('call_function', 'call_method', 'get_param', 'call_module', 'placeholder') + assert op in ('call_function', 'call_method', 'get_attr', 'call_module', 'placeholder') args = () if args is None else args kwargs = {} if kwargs is None else kwargs self._mark_uses(args) @@ -105,8 +105,8 @@ def create_node(self, op: str, target: Target, def placeholder(self, name: str) -> Node: return self.create_node('placeholder', name) - def get_param(self, name: str) -> Node: - return self.create_node('get_param', name) + def get_attr(self, name: str) -> Node: + return self.create_node('get_attr', name) def call_module(self, module_name: str, @@ -208,7 +208,7 @@ def python_code(self, root_module: str) -> Tuple[str, str, List[str]]: assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})\n') continue - elif node.op == 'get_param': + elif node.op == 'get_attr': assert isinstance(node.target, str) body.append(f'{node.name} = {_format_target(root_module, node.target)}\n') continue @@ -242,7 +242,7 @@ def format_node(n : Node) -> Optional[str]: assert isinstance(n.target, str) placeholder_names.append(n.target) return None - elif n.op == 'get_param': + elif n.op == 'get_attr': return f'%{n.name} : [uses={n.uses}] = self.{n.target}' else: return f'%{n.name} : [uses={n.uses}] = {n.op}[target={n.target}](' \ diff --git 
a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 83feed72b752..e635819550ad 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -135,13 +135,13 @@ def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph): if hasattr(root, 'training'): self.training = root.training for node in graph.nodes: - if node.op in ['get_param', 'call_module']: + if node.op in ['get_attr', 'call_module']: assert isinstance(node.target, str) _copy_attr(root, self, node.target) elif isinstance(root, dict): targets_to_copy = [] for node in graph.nodes: - if node.op in ['get_param', 'call_module']: + if node.op in ['get_attr', 'call_module']: assert isinstance(node.target, str) if node.target not in root: raise RuntimeError('Node ' + str(node) + ' referenced target ' + node.target + diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 442fa28c36d9..9b192dd5501f 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -55,15 +55,15 @@ def create_arg(self, a: Any) -> Argument: if isinstance(a, torch.nn.Parameter): for n, p in self.root.named_parameters(): if a is p: - return self.create_node('get_param', n, (), {}) + return self.create_node('get_attr', n, (), {}) raise NameError('parameter is not a member of this module') # Tensors do not have a reliable string repr() from which they can be # constructed (and we probably don't want to rely on that, either), so # for any constant Tensor values we encounter, first search for if they # are an attribute of some module in the module hierarchy. If so, emit - # a get_param to retrieve that tensor. Otherwise, we'll store away the + # a get_attr to retrieve that tensor. Otherwise, we'll store away the # tensor value into a special attribute on the Module s.t. we can - # retrieve it with a get_param. + # retrieve it with a get_attr. 
if isinstance(a, torch.Tensor): # TODO: slow def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: @@ -96,7 +96,7 @@ def search_for_tensor(m : torch.nn.Module) -> Optional[List[str]]: i += 1 setattr(self.root, qualname, a) - return self.create_node('get_param', qualname, (), {}) + return self.create_node('get_attr', qualname, (), {}) return super().create_arg(a) def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool: diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 8d8ef0f328c3..7967b4ec2dcb 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -177,7 +177,7 @@ def get_qconfig(module): self.qconfig_map = dict() for node in input_graph.nodes: - if node.op == 'get_param': + if node.op == 'get_attr': parent, _ = _parent_name(node.target) self.qconfig_map[node.name] = get_qconfig(self.modules[parent]) elif node.op == 'call_function': @@ -557,7 +557,7 @@ def load_arg(a): setattr(quantized_root, packed_weight_name, packed_weight) # replace prepack node with a getattr node env[node.name] = folded_graph.create_node( - 'get_param', packed_weight_name, (), {}) + 'get_attr', packed_weight_name, (), {}) elif prepack_node is not None: # remove the foled node continue diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index 95d19df1e1b4..5d5532dc48fc 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -17,7 +17,7 @@ def graph_pretty_str(g, shorten=True) -> str: built_in_meth_re = re.compile('') op_dict = { 'placeholder': 'plchdr', - 'get_param': 'gt_prm', + 'get_attr': 'gt_prm', 'call_function': 'cl_fun', 'call_module': 'cl_mod', 'call_method': 'cl_meth', @@ -136,5 +136,5 @@ def get_next_qparams_idx(module, qparams): for key, value in qparams.items(): setattr(root_module, key + str(idx), value) qparam_full_path = key + str(idx) - inputs.append(graph.create_node('get_param', qparam_full_path)) + inputs.append(graph.create_node('get_attr', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) From ccfbfe5eb5c318d17f6994be31fe3f38261addff Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 22 Sep 2020 17:09:47 -0700 Subject: [PATCH 029/449] [quant][graphmode][fx] Custom module support (#44766) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44766 There might be modules that are not symbolically traceable, e.g. LSTM (since it has input dependent control flows), to support quantization in these cases, user will provide the corresponding observed and quantized version of the custom module, the observed custom module with observers already inserted in the module and the quantized version will have the corresponding ops quantized. 
And use ``` from torch.quantization import register_observed_custom_module_mapping from torch.quantization import register_quantized_custom_module_mapping register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) ``` to register the custom module mappings, we'll also need to define a custom delegate class for symbolic trace in order to prevent the custom module from being traced: ```python class CustomDelegate(DefaultDelegate): def is_leaf_module(self, m): return (m.__module__.startswith('torch.nn') and not isinstance(m, torch.nn.Sequential)) or \ isinstance(m, CustomModule) m = symbolic_trace(original_m, delegate_class=CustomDelegate) ``` Test Plan: Imported from OSS Reviewed By: z-a-f Differential Revision: D23723455 fbshipit-source-id: 50d666e29b94cbcbea5fb6bcc73b00cff87eb77a --- test/quantization/test_quantize_fx.py | 136 ++++++++++++++++++ torch/nn/quantized/modules/conv.py | 2 +- torch/quantization/__init__.py | 6 + .../custom_module_class_mappings.py | 75 ++++++++++ .../quantization/fx/quantization_patterns.py | 25 ++++ torch/quantization/fx/quantize.py | 32 ++++- 6 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 torch/quantization/custom_module_class_mappings.py diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 3170bfbfe8b4..fc4a735854ef 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -20,6 +20,8 @@ quantize_static_fx, quantize_dynamic_fx, prepare_qat_fx, + register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.quantization import ( @@ -482,6 +484,140 @@ def forward(self, x): # Verify that loaded state dict produces same results. 
self.assertEqual(quant(x), quant_2(x)) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + class DynamicallyQuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + quantized = cls(nnqd.Conv2d.from_float(observed_module.conv)) + return quantized + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + + def forward(self, x): + x = self.conv(x) + x = self.custom(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + from torch.fx.symbolic_trace import Tracer + + # define a custom tracer to not trace through the custom module + + class CustomTracer(Tracer): + def is_leaf_module(self, m, module_qualified_name): + return (m.__module__.startswith('torch.nn') and + not isinstance(m, torch.nn.Sequential)) or \ + isinstance(m, CustomModule) + + # TODO: add other quant types after mixed mode support + for quant_type in [QuantType.STATIC]: + # register observed and quantized custom module classes + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + m = CustomTracer().trace(original_m).eval() + qconfig_dict = {'': default_qconfig} + # check prepared model + m = prepare_static_fx(m, qconfig_dict) + # calibration + m(data) + # all activation observers are inserted in the top level module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 3 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + + # check converted/quantized model 
+ m = convert_static_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = symbolic_trace(original_ref_m).eval() + ref_m = prepare_fx(ref_m, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops """ diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index fe1ced91624f..773a9a37fbb3 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -146,7 +146,7 @@ def __setstate__(self, state): @classmethod def get_qconv(cls, mod, activation_post_process, weight_post_process=None): - r"""Creates a qconv object and returns it. + r"""Creates a qconv object and returns it. """ if weight_post_process is None: weight_post_process = mod.qconfig.weight() diff --git a/torch/quantization/__init__.py b/torch/quantization/__init__.py index ed908ddf85c3..3193c332469f 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -9,6 +9,7 @@ from .quantize_fx import * from .quantization_mappings import * from .fuser_method_mappings import * +from .custom_module_class_mappings import * def default_eval_fn(model, calib_data): r""" @@ -40,6 +41,11 @@ def default_eval_fn(model, calib_data): 'get_compare_output_module_list', 'register_quantized_operator_mapping', 'get_quantized_operator', 'register_fuser_method', 'get_fuser_method', + 'register_observed_custom_module_mapping', + 'get_observed_custom_module_class', + 'register_quantized_custom_mdoule_mapping', + 'get_quantized_custom_module_class', + 'is_custom_module_class', # Sub functions for `prepare` and `swap_module` 'propagate_qconfig_', 'add_quant_dequant', 'add_observer_', 'swap_module', 'default_eval_fn', 'get_observer_dict', diff --git a/torch/quantization/custom_module_class_mappings.py b/torch/quantization/custom_module_class_mappings.py new file mode 100644 index 000000000000..c62290228c5b --- /dev/null +++ b/torch/quantization/custom_module_class_mappings.py @@ -0,0 +1,75 @@ +OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS = dict() + +def register_observed_custom_module_mapping(float_custom_module_class, observed_custom_module_class): + """ Register a mapping from `float_custom_module_class` to + `observed_custom_module_class` + `observed_custom_module_class` will have a `from_float` classmethod, + which will return an observed custom module instance given + a float custom module instance. + This will be used in prepare step of post training static quantization or + quantization aware training + """ + assert hasattr(observed_custom_module_class, 'from_float'), 'from_float must be' + \ + ' defined in observed custom module class' + OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS[float_custom_module_class] = \ + observed_custom_module_class + +def get_observed_custom_module_class(float_custom_module_class): + """ Get the corresponding observed module class for a given + float custom module. 
+ """ + observed_custom_module_class = \ + OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS.get(float_custom_module_class, None) + assert observed_custom_module_class is not None, \ + 'Float Custom module class {}'.format(float_custom_module_class) + \ + ' does not have a corresponding observed module class' + return observed_custom_module_class + +QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS = dict() + +def register_quantized_custom_module_mapping(float_custom_module_class, quantized_custom_module_class): + """ Register a mapping from `float_custom_module_class` to `quantized_custom_module_class` + A quantized custom module class should accept quantized input and + return quantized output. (we can relax this condition in the + future if there is a need) + `quantized_custom_module_class` will have a `from_observed` classmethod, + which will return an quantized custom module instance given + a observed custom module instance. + This will be used in prepare step of post training static quantization or + quantization aware training + """ + assert hasattr(quantized_custom_module_class, 'from_observed'), 'from_observed' + \ + ' must be defined in quantized custom module class' + QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS[float_custom_module_class] = \ + quantized_custom_module_class + +def get_quantized_custom_module_class(float_custom_module_class): + """ Get the corresponding quantized module class for a given + float custom module. + """ + quantized_custom_module_class = \ + QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS.get(float_custom_module_class, None) + assert quantized_custom_module_class is not None, \ + 'Float Custom module class {}'.format(float_custom_module_class) + \ + ' does not have a corresponding quantized module class' + return quantized_custom_module_class + +def is_custom_module_class(module_class): + """ Check if a given module class is a custom module class + """ + return module_class in OBSERVED_CUSTOM_MODULE_CLASS_MAPPINGS and \ + module_class in QUANTIZED_CUSTOM_MODULE_CLASS_MAPPINGS + +def mark_observed_custom_module(module, custom_module_class): + """ Mark a module as observed custom module, so that + it can be identified during convert step + """ + module._is_observed_custom_module = True + module._FLOAT_MODULE = custom_module_class + +def is_observed_custom_module(module): + """ Check if a module is marked as observed custom module + or not + """ + return hasattr(module, '_is_observed_custom_module') and \ + module._is_observed_custom_module diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index fa5a8733bbf7..ab85c9a9daff 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -6,6 +6,9 @@ get_static_quant_module_class, get_quantized_operator, ) +from ..custom_module_class_mappings import ( + get_quantized_custom_module_class, +) from .pattern_utils import ( register_quant_pattern, register_dynamic_quant_pattern, @@ -507,6 +510,28 @@ def convert(self, quantizer, node): quantizer.quantized_graph, node, quantizer.activation_post_process_map[node.name]) +class CustomModuleQuantizeHandler(QuantizeHandler): + def convert(self, quantizer, node, load_arg, debug=False): + """ Convert a float custom module to quantized custom module + """ + assert node.op == 'call_module' + observed_custom_module = quantizer.modules[node.target] + if node.name in quantizer.activation_post_process_map: + observed_custom_module.activation_post_process = \ + 
quantizer.activation_post_process_map[node.name] + quantized_custom_module_class = \ + get_quantized_custom_module_class(observed_custom_module._FLOAT_MODULE) + quantized_custom_module = \ + quantized_custom_module_class.from_observed(observed_custom_module) + parent_name, name = _parent_name(node.target) + setattr(quantizer.modules[parent_name], name, quantized_custom_module) + # hardcoded the qunatized input to be None (take whatever is in the environemnt), + # we can extend this + # if there is a need, e.g. get the indexes of quantized inputs from some + # module attribute like module._QUANTIZED_INPUT_INDEXES + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + + # 2. Post Training Dynamic Quantizatoin Patterns @register_dynamic_quant_pattern(torch.nn.Linear) @register_dynamic_quant_pattern(torch.nn.functional.linear) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 7967b4ec2dcb..8d742255838a 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -18,6 +18,12 @@ from ..quantization_mappings import ( get_qat_module_mappings, ) +from ..custom_module_class_mappings import ( + is_custom_module_class, + get_observed_custom_module_class, + mark_observed_custom_module, + is_observed_custom_module, +) from ..quantize import _remove_qconfig @@ -193,7 +199,6 @@ def _prepare(self, model, qconfig_dict, inplace, is_dynamic_quant): if not inplace: model = copy.deepcopy(model) self.is_dynamic_quant = is_dynamic_quant - # TODO: allow user specified patterns if self.is_dynamic_quant: self.patterns = get_dynamic_quant_patterns() else: @@ -235,6 +240,8 @@ def load_arg(a): env[node.name] = observed_graph.node_copy(node, load_arg) elif root_node is node: env[node.name] = observed_graph.node_copy(node, load_arg) + if qconfig is None: + continue def insert_observer(node, observer, device): get_new_observer_name = get_new_attr_name_with_prefix(prefix) @@ -246,10 +253,22 @@ def insert_observer(node, observer, device): if device: getattr(model, observer_name).to(device) + if isinstance(obj, CustomModuleQuantizeHandler): + custom_module = self.modules[node.target] + observed_custom_module_class = \ + get_observed_custom_module_class(type(custom_module)) + observed_custom_module = \ + observed_custom_module_class.from_float(custom_module) + mark_observed_custom_module(observed_custom_module, type(custom_module)) + parent_name, name = _parent_name(node.target) + setattr(self.modules[parent_name], name, observed_custom_module) + # don't need to insert observer for output in dynamic quantization if self.is_dynamic_quant: continue + # inserting observers for output of observed module, or mark the output + # as observed if isinstance(obj, CopyNode): assert node.op in [ 'call_module', @@ -355,6 +374,7 @@ def _convert(self, model, inplace=False, debug=False, is_dynamic_quant=False): self.modules = dict(model.named_modules()) matches = self._find_matches(model.graph, self.modules, self.patterns) + quants = self._find_quants(model.graph, matches) self.quantized_graph = Graph() env = {} @@ -619,6 +639,16 @@ def record_match(pattern, node, matched): all_matched.add(n.name) # break after finding the first match break + + # add custom module instances to the match result + for node in graph.nodes: + if node.op == 'call_module' and \ + (is_custom_module_class(type(self.modules[node.target])) or + is_observed_custom_module(self.modules[node.target])): + custom_module_qconfig = self.qconfig_map[node.name] + match_map[node.name] = 
( + node, [node], CustomModuleQuantizeHandler(self, node), custom_module_qconfig) + return match_map def _find_quants(self, graph, matches): From 2a37f3fd2f74e2d10f3440e6dfef2d5389caab62 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Sep 2020 17:24:54 -0700 Subject: [PATCH 030/449] Relax CUDA architecture check (#45130) Summary: NVIDIA GPUs are binary compatible within major compute capability revision This would prevent: "GeForce RTX 3080 with CUDA capability sm_86 is not compatible with the current PyTorch installation." messages from appearing, since CUDA-11 do not support code generation for sm_85. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45130 Reviewed By: ngimel Differential Revision: D23841556 Pulled By: malfet fbshipit-source-id: bcfc9e8da63dfe62cdec06909b6c049aaed6a18a --- torch/cuda/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 53aea1141d47..e8687cad17e8 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -96,9 +96,8 @@ def _check_cubins(): supported_sm = [int(arch.split('_')[1]) for arch in arch_list if 'sm_' in arch] for idx in range(device_count()): cap_major, cap_minor = get_device_capability(idx) - capability = cap_major * 10 + cap_minor - # NVIDIA GPU compute architectures are backward compatible within 5 minor revisions versions - supported = any([capability >= sm and capability - (sm // 5) * 5 < 5 for sm in supported_sm]) + # NVIDIA GPU compute architectures are backward compatible within major version + supported = any([sm // 10 == cap_major for sm in supported_sm]) if not supported: device_name = get_device_name(idx) warnings.warn(incompatible_device_warn.format(device_name, capability, " ".join(arch_list), device_name)) From b98ac208492a421944a1ae19ef7883ab1a97bb73 Mon Sep 17 00:00:00 2001 From: "Daily, Jeff" Date: Tue, 22 Sep 2020 17:41:41 -0700 Subject: [PATCH 031/449] install ATen/native/cuda and hip headers (#45097) Summary: The ATen/native/cuda headers were copied to torch/include, but then not included in the final package. Further, add ATen/native/hip headers to the installation, as well. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45097 Reviewed By: mruberry Differential Revision: D23831006 Pulled By: malfet fbshipit-source-id: ab527928185faaa912fd8cab208733a9b11a097b --- aten/src/ATen/CMakeLists.txt | 3 ++- setup.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 1bcbae8abeff..5ec9d24eea39 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -78,6 +78,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp}) file(GLOB native_cuda_cpp "native/cuda/*.cpp") file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") +file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") @@ -372,7 +373,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${cudnn_h} ${hip_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h}) endif() # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake diff --git a/setup.py b/setup.py index 2a2f911e0d3d..753e2b0f14a1 100644 --- a/setup.py +++ b/setup.py @@ -776,6 +776,10 @@ def print_box(msg): 'include/ATen/detail/*.h', 'include/ATen/native/*.h', 'include/ATen/native/cpu/*.h', + 'include/ATen/native/cuda/*.h', + 'include/ATen/native/cuda/*.cuh', + 'include/ATen/native/hip/*.h', + 'include/ATen/native/hip/*.cuh', 'include/ATen/native/quantized/*.h', 'include/ATen/native/quantized/cpu/*.h', 'include/ATen/quantized/*.h', From c0267c68454cf469d760c0eb3e952c1cb5f63af5 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Tue, 22 Sep 2020 17:47:28 -0700 Subject: [PATCH 032/449] [caffe2] Support data types in shape hints (#45110) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45110 A recent change in DSNN quantizes the ad embedding to 8 bits. Ad embeddings are part of the inputs to the DSNN merge net. To correctly pass shape hints of input tensors including quantized ad embeddings, we need to be able to annotate the data types in shape hints. A bit on the corner cases, if type is omitted or not a valid type, e.g., white spaces, instead of throwing an exception, I decided to return the default type, float. 
Test Plan: ``` buck test caffe2/caffe2/fb/opt:shape_info_utils_test ``` Reviewed By: yinghai Differential Revision: D23834091 fbshipit-source-id: 5e072144a7a7ff4b5126b618062dfc4041851dd3 --- caffe2/opt/shape_info.cc | 75 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/caffe2/opt/shape_info.cc b/caffe2/opt/shape_info.cc index 0ff55693395f..dfcdeb0356bd 100644 --- a/caffe2/opt/shape_info.cc +++ b/caffe2/opt/shape_info.cc @@ -5,6 +5,63 @@ namespace caffe2 { +namespace { +bool isNumber(const std::string& s) { + bool empty = true; + for (const char c : s) { + if (std::isalpha(c)) { + return false; + } + if (!std::isspace(c)) { + empty = false; + } + } + return !empty; +} + +std::string toLower(const std::string& s) { + std::string t; + t.resize(s.size()); + for (size_t i = 0; i < t.size(); i++) { + t[i] = std::tolower(s[i]); + } + return t; +} + +TensorProto_DataType toTensorProtoDataType(const std::string& in) { + std::string s = toLower(in); + if (s == "uint8") { + return TensorProto_DataType_UINT8; + } else if (s == "int8") { + return TensorProto_DataType_INT8; + } else if (s == "uint16") { + return TensorProto_DataType_UINT16; + } else if (s == "int16") { + return TensorProto_DataType_INT16; + } else if (s == "int32") { + return TensorProto_DataType_INT32; + } else if (s == "int64") { + return TensorProto_DataType_INT64; + } else if (s == "float16" || s == "half") { + return TensorProto_DataType_FLOAT16; + } else if (s == "float") { + return TensorProto_DataType_FLOAT; + } else if (s == "double") { + return TensorProto_DataType_DOUBLE; + } else if (s == "byte") { + return TensorProto_DataType_BYTE; + } else if (s == "string") { + return TensorProto_DataType_STRING; + } else if (s == "bool") { + return TensorProto_DataType_BOOL; + } else if (s == "hash") { + return TensorProto_DataType_ZERO_COLLISION_HASH; + } + // return default data type, float + return TensorProto_DataType_FLOAT; +} +} // namespace + ShapeInfo getShapeInfoFromBlob(const Blob* blob) { ShapeInfo shape_info; shape_info.shape = GetTensorShapeOfBlob(blob); @@ -138,14 +195,24 @@ void parseShapeInfoMapFromString( const auto& name = kv[0]; TensorShape shape; - if (name.find("int8") != std::string::npos) { - shape.set_data_type(TensorProto_DataType_UINT8); + size_t size = kv.size(); + CAFFE_ENFORCE_GT(size, 1); + if (!isNumber(kv[size - 1])) { + // last value is the type + shape.set_data_type(toTensorProtoDataType(kv[size - 1])); + size--; } else { - shape.set_data_type(TensorProto_DataType_FLOAT); + if (name.find("int8") != std::string::npos) { + // Kept for backwards compatibility. + // Set type explicitly to overwrite it. + shape.set_data_type(TensorProto_DataType_UINT8); + } else { + shape.set_data_type(TensorProto_DataType_FLOAT); + } } bool valid = true; - for (int i = 1; i < kv.size(); i++) { + for (int i = 1; i < size; i++) { auto dim = kv[i]; try { shape.add_dims(std::stoi(dim)); From ebde5a80bb0bdb30acb83124d7b326644ae76508 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Tue, 22 Sep 2020 18:15:56 -0700 Subject: [PATCH 033/449] [tensorexpr] Add flag to fuse with unknown shapes (#44401) Summary: This flag simply allows users to get fusion groups that will *eventually* have shapes (such that `getOperation` is a valid). This is useful for doing early analysis and compiling just in time. 
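For reference, a minimal usage sketch of the new flag (the include path and the call signature mirror the header and test changes below; the wrapper function name and the idea of calling it from a standalone pass are illustrative assumptions, not part of this change):

```cpp
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>

// Sketch: build fusion groups even though tensor shapes are still unknown.
// The caller then becomes responsible for annotating shapes on the resulting
// prim::TensorExprGroup subgraphs before getOperation() is invoked on them.
void fuseWithoutShapeChecks(std::shared_ptr<torch::jit::Graph>& graph) {
  torch::jit::FuseTensorExprs(
      graph,
      /* min_group_size= */ 2,
      /* disable_shape_checks= */ true);
}
```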
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44401 Reviewed By: ZolotukhinM Differential Revision: D23656140 Pulled By: bwasti fbshipit-source-id: 9a26c202752399d1932ad7d69f21c88081ffc1e5 --- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 21 ++++++++++++++++++++- test/cpp/tensorexpr/tests.h | 1 + torch/csrc/jit/passes/tensorexpr_fuser.cpp | 20 +++++++++++++++----- torch/csrc/jit/passes/tensorexpr_fuser.h | 8 +++++++- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 680311685375..826cf7209346 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -151,7 +151,7 @@ void testFuserPass_UnknownShapes() { %y : Tensor): %a : Tensor = aten::mul(%x, %y) %b : Tensor = aten::mul(%x, %a) - return (%a))IR"; + return (%b))IR"; auto g = std::make_shared(); torch::jit::parseIR(graph_string, g.get()); @@ -311,5 +311,24 @@ void testFuserPass_MergeGroups() { ->run(*g); } +void testFuserPass_UnknownShapesIgnored() { + WithCPUFuser cf; + KernelScope kernel_scope; + const auto graph_string = R"IR( + graph(%x : Float(device=cpu), + %y : Float(device=cpu)): + %a : Float(device=cpu) = aten::mul(%x, %y) + %b : Float(device=cpu) = aten::mul(%x, %a) + return (%b))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs(g, /* min_group_size= */ 2, /* disable_shape_checks= */ true); + + // Test that we are generating fusion groups even though shapes are not known + testing::FileCheck().check("prim::TensorExprGroup")->run(*g); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 20206a348d25..c38a368af13c 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -291,6 +291,7 @@ namespace jit { _(FuserPass_0DimInput) \ _(FuserPass_UnfusibleDevice) \ _(FuserPass_UnknownShapes) \ + _(FuserPass_UnknownShapesIgnored) \ _(FuserPass_Multidevice) \ _(FuserPass_MergeGroups) \ _(TrainBasic) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 02b4861eabfe..4d98110d3975 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -284,8 +284,13 @@ void RemoveTensorTypeSpecializations(std::shared_ptr& graph) { class TensorExprFuser { public: - TensorExprFuser(std::shared_ptr graph, size_t min_group_size) - : graph_(std::move(graph)), min_group_size_(min_group_size) {} + TensorExprFuser( + std::shared_ptr graph, + size_t min_group_size, + bool disable_shape_checks) + : graph_(std::move(graph)), + min_group_size_(min_group_size), + disable_shape_checks_(disable_shape_checks) {} void run() { aliasDb_ = torch::make_unique(graph_); @@ -606,7 +611,7 @@ class TensorExprFuser { bool canHandle(Node* node) { REQ(node->kind() != prim::Constant); - REQ(allShapesAreKnown(node)); + REQ(disable_shape_checks_ || allShapesAreKnown(node)); REQ(isFusableOnDevice(node)); // Don't include nodes whose inputs are tensor constants - we cannot handle @@ -836,9 +841,14 @@ class TensorExprFuser { // Minimal size of a fusion group size_t min_group_size_; + // If true, shapes are ignored + bool disable_shape_checks_; }; -void FuseTensorExprs(std::shared_ptr& graph, size_t min_group_size) { +void FuseTensorExprs( + std::shared_ptr& graph, + size_t min_group_size, + bool disable_shape_checks) { GRAPH_DUMP("Before TExprFuser: ", graph); // 
Temporary change for Block code generation. @@ -849,7 +859,7 @@ void FuseTensorExprs(std::shared_ptr& graph, size_t min_group_size) { // Get rid of dead code so that we don't waste effort fusing it. EliminateDeadCode(graph); - TensorExprFuser fuser(graph, min_group_size); + TensorExprFuser fuser(graph, min_group_size, disable_shape_checks); fuser.run(); EliminateCommonSubexpression(graph); diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.h b/torch/csrc/jit/passes/tensorexpr_fuser.h index db2ee0482960..a99cc88ef439 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.h +++ b/torch/csrc/jit/passes/tensorexpr_fuser.h @@ -10,9 +10,15 @@ namespace jit { struct Graph; // Run TensorExpressions-based fuser. +// +// If shape checks are disabled it is the responsibilty of +// the caller to ensure that the resultant subgraph is correctly +// annotated with shapes by the time "getOperation" is called +// on the node. TORCH_API void FuseTensorExprs( std::shared_ptr& graph, - size_t min_group_size = 2); + size_t min_group_size = 2, + bool disable_shape_checks = false); TORCH_API void setTensorExprFuserEnabled(bool val); TORCH_API bool tensorExprFuserEnabled(); From e045119956aa8ed07e293714fd674bcff6251d69 Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 22 Sep 2020 18:35:55 -0700 Subject: [PATCH 034/449] [JIT] Add default arguments for class types (#45098) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45098 **Summary** This commit adds support for default arguments in methods of class types. Similar to how default arguments are supported for regular script functions and methods on scripted modules, default values are retrieved from the definition of a TorchScript class in Python as Python objects, converted to IValues, and then attached to the schemas of already compiled class methods. **Test Plan** This commit adds a set of new tests to TestClassType to test default arguments. **Fixes** This commit fixes #42562. Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D23844769 Pulled By: SplitInfinity fbshipit-source-id: ceedff7703bf9ede8bd07b3abcb44a0f654936bd --- test/jit/test_class_type.py | 102 +++++++++++++++++++++++++- torch/_C/__init__.pyi.in | 1 + torch/csrc/jit/python/script_init.cpp | 26 +++++++ torch/jit/_script.py | 3 +- torch/jit/frontend.py | 26 +++++++ 5 files changed, 156 insertions(+), 2 deletions(-) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index c71be6ac1d9f..3fcd89347091 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -13,7 +13,7 @@ from torch.testing._internal.jit_utils import JitTestCase import torch.testing._internal.jit_utils from torch.testing._internal.common_utils import IS_SANDCASTLE -from typing import List, Tuple, Iterable +from typing import List, Tuple, Iterable, Optional, Dict if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -1020,6 +1020,106 @@ def foo(): y.my_list = new_list return y + def test_default_args(self): + """ + Test that methods on class types can have default arguments. 
+ """ + @torch.jit.script + class ClassWithDefaultArgs: + def __init__( + self, + a: int = 1, + b: Optional[List[int]] = None, + c: Tuple[int, int, int] = (1, 2, 3), + d: Optional[Dict[int, int]] = None, + e: Optional[str] = None, + ): + self.int = a + self.tup = c + self.str = e + + self.list = [1, 2, 3] + if b is not None: + self.list = b + + self.dict = {1: 2, 3: 4} + if d is not None: + self.dict = d + + def add(self, b: int, scale: float = 1.0) -> float: + return self.int * scale + b + + def all_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.int + obj.list[2] + obj.tup[1] + + def some_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(b=[5, 6, 7]) + return obj.int + obj.list[2] + obj.dict[1] + + def override_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(3, [9, 10, 11], (12, 13, 14), {3: 4}, "str") + s: int = obj.int + + for x in obj.list: + s += x + + for y in obj.tup: + s += y + + s += obj.dict[3] + + st = obj.str + if st is not None: + s += len(st) + + return s + + def method_defaults() -> float: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.add(3) + obj.add(3, 0.25) + + self.checkScript(all_defaults, ()) + self.checkScript(some_defaults, ()) + self.checkScript(override_defaults, ()) + self.checkScript(method_defaults, ()) + + # The constructor of this class below has some arguments without default values. + class ClassWithSomeDefaultArgs: # noqa: B903 + def __init__( + self, + a: int, + b: int = 1, + ): + self.a = a + self.b = b + + def default_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1) + return obj.a + obj.b + + def set_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1, 4) + return obj.a + obj.b + + self.checkScript(default_b, ()) + self.checkScript(set_b, ()) + + # The constructor of this class below has mutable arguments. This should throw + # an error. + class ClassWithMutableArgs: # noqa: B903 + def __init__( + self, + a: List[int] = [1, 2, 3], # noqa: B006 + ): + self.a = a + + def should_fail(): + obj: ClassWithMutableArgs = ClassWithMutableArgs() + + with self.assertRaisesRegex(RuntimeError, "Mutable default parameters are not supported"): + torch.jit.script(should_fail) + def test_staticmethod(self): """ Test static methods on class types. diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 0d48ea710fdd..41e0e887f829 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -224,6 +224,7 @@ def _jit_script_compile( def _jit_script_class_compile( qual_name: str, definition: ClassDef, + defaults: Dict[str, Dict[str, Any]], rcb: ResolutionCallback, ): ... def _parse_source_def(src: str) -> Def: ... 
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 5ed9ba9dc0a7..95d041fe315b 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -53,6 +53,7 @@ using ::c10::FunctionSchema; using ResolutionCallback = std::function; using FunctionDefaults = std::unordered_map; +using ClassMethodDefaults = std::unordered_map; namespace { @@ -1301,6 +1302,7 @@ void initJitScriptBindings(PyObject* module) { "_jit_script_class_compile", [](const std::string& qualifiedName, const ClassDef& classDef, + const ClassMethodDefaults& defaults, ResolutionCallback rcb) { C10_LOG_API_USAGE_ONCE("torch.script.class"); if (classDef.superclass().present()) { @@ -1339,6 +1341,30 @@ void initJitScriptBindings(PyObject* module) { const auto self = SimpleSelf(classType); cu->define(classname, props, propRcbs, methodDefs, methodRcbs, &self); + + // Stitch in default arguments for methods. Properties don't need to be + // considered since there is no way to invoke setters without passing in + // a value. + auto defs_it = methodDefs.begin(); + while (defs_it != methodDefs.end()) { + auto def_name = (*defs_it).name().name(); + // If the method is not in the defaults map, assume there are + // no default arguments for it. + auto default_it = defaults.find(def_name); + if (default_it == defaults.end()) { + continue; + } + + const auto method_name = + QualifiedName(classname, (*defs_it).name().name()); + auto& method = cu->get_function(method_name); + method.setSchema(getSchemaWithNameAndDefaults( + defs_it->range(), + method.getSchema(), + at::nullopt, + default_it->second)); + ++defs_it; + } }); m.def( "_jit_script_interface_compile", diff --git a/torch/jit/_script.py b/torch/jit/_script.py index f5969dbaf030..fb0465288e3f 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -60,7 +60,8 @@ def _is_new_style_class(cls): def _compile_and_register_class(obj, rcb, qualified_name): ast = get_jit_class_def(obj, obj.__name__) - torch._C._jit_script_class_compile(qualified_name, ast, rcb) + defaults = torch.jit.frontend.get_default_args_for_class(obj) + torch._C._jit_script_class_compile(qualified_name, ast, defaults, rcb) torch.jit._state._add_script_class(obj, qualified_name) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 2a6dfb498986..4cfba50d0466 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -305,6 +305,32 @@ def get_default_args(fn): } +def get_default_args_for_class(cls): + """ + Get default arguments for all methods in a class (except for static methods). + + Args: + cls: type - The class type to inspect for default arguments. + Returns: + A Dict[str, Dict[str, Any]] which maps each method name to a Dict[str, Any] + that maps each argument name to its default value. + """ + # Get methods (except static methods because those are compiled separately as + # if they were independent script functions). + methods = inspect.getmembers( + cls, + predicate=lambda m: (inspect.ismethod(m) or inspect.isfunction(m)) + and not is_static_fn(cls, m.__name__) + and m.__name__ in cls.__dict__ + ) + + # Get method defaults. Property defaults do not need to be considered + # because setters cannot be invoked without a value. 
+ defaults = {method_name: get_default_args(method_impl) for method_name, method_impl in methods} + + return defaults + + class WithItemBuilder(Builder): @staticmethod def build_withitem(ctx, item): From f575df201f290fba6e8db6d8581f57bc9ba9b07f Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 22 Sep 2020 19:35:45 -0700 Subject: [PATCH 035/449] [quant][graphmode][jit][api] Expose preserved_attrs from finalize to convert_jit (#44490) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44490 Test Plan: Imported from OSS Reviewed By: z-a-f Differential Revision: D23631142 fbshipit-source-id: f0913f0cb4576067e2a7288326024942d12e0ae0 --- torch/csrc/jit/passes/quantization/finalize.cpp | 7 +++++-- torch/csrc/jit/passes/quantization/finalize.h | 4 +++- torch/csrc/jit/python/init.cpp | 9 ++++++--- torch/quantization/quantize_jit.py | 15 +++++++++------ 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/torch/csrc/jit/passes/quantization/finalize.cpp b/torch/csrc/jit/passes/quantization/finalize.cpp index 3d0d9a6eff6c..635c02728f6b 100644 --- a/torch/csrc/jit/passes/quantization/finalize.cpp +++ b/torch/csrc/jit/passes/quantization/finalize.cpp @@ -73,12 +73,15 @@ void FoldQuantizedPrepackingOps(Module& module) { PrePackingOpsFolder(module, filter_fn, "quantized"); } -Module Finalize(Module& module, QuantType quant_type) { +Module Finalize( + Module& module, + QuantType quant_type, + const std::vector& preserved_attrs) { auto graph = module.get_method("forward").graph(); InsertPrepackUnpack(graph); GRAPH_DUMP("Before QuantFusion:", graph); QuantFusion(graph, quant_type); - auto frozen = freeze_module(module); + auto frozen = freeze_module(module, preserved_attrs); FoldQuantizedPrepackingOps(frozen); return frozen; } diff --git a/torch/csrc/jit/passes/quantization/finalize.h b/torch/csrc/jit/passes/quantization/finalize.h index 1de65dcb20e4..062d1e24251e 100644 --- a/torch/csrc/jit/passes/quantization/finalize.h +++ b/torch/csrc/jit/passes/quantization/finalize.h @@ -49,7 +49,9 @@ TORCH_API void InsertPrepackUnpack(Module& module); TORCH_API script::Module Finalize( script::Module& module, - QuantType quant_type = QuantType::STATIC); + QuantType quant_type = QuantType::STATIC, + const std::vector& preserved_attrs = + std::vector()); TORCH_API void FoldQuantizedPrepackingOps(Module& module); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 052b22a78917..db866704aa97 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -284,12 +284,15 @@ void initJITBindings(PyObject* module) { [](Module& module) { SwapFunctionalLinear(module); }) .def( "_jit_pass_quant_finalize", - [](Module& module, int quant_type_int) { + [](Module& module, + int quant_type_int, + const std::vector& preserved_attrs) { auto quant_type = static_cast(quant_type_int); - return Finalize(module, quant_type); + return Finalize(module, quant_type, preserved_attrs); }, py::arg("module"), - py::arg("quant_type_int") = 1) + py::arg("quant_type_int") = 1, + py::arg("preserved_attrs") = std::vector()) .def( "_jit_pass_pattern_based_rewrite", [](const Module& m) { return PatternBasedRewrite(m); }) diff --git a/torch/quantization/quantize_jit.py b/torch/quantization/quantize_jit.py index 130f0297357c..ef6792d521f6 100644 --- a/torch/quantization/quantize_jit.py +++ b/torch/quantization/quantize_jit.py @@ -67,7 +67,8 @@ def prepare_jit(model, qconfig_dict, inplace=False): def prepare_dynamic_jit(model, qconfig_dict, inplace=False): return 
_prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.DYNAMIC) -def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC): +def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC, + preserved_attrs=None): _check_is_script_module(model) model.eval() model_c = model._c @@ -76,18 +77,20 @@ def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC) # Moving model parameters to CPU since quantized operators # are only supported on CPU right now model.cpu() - model_c = torch._C._jit_pass_quant_finalize(model_c, quant_type) + if preserved_attrs is None: + preserved_attrs = [] + model_c = torch._C._jit_pass_quant_finalize(model_c, quant_type, preserved_attrs) if inplace: model._reconstruct(model_c) else: model = wrap_cpp_module(model_c) return model -def convert_jit(model, inplace=False, debug=False): - return _convert_jit(model, inplace, debug, quant_type=QuantType.STATIC) +def convert_jit(model, inplace=False, debug=False, preserved_attrs=None): + return _convert_jit(model, inplace, debug, quant_type=QuantType.STATIC, preserved_attrs=preserved_attrs) -def convert_dynamic_jit(model, inplace=False, debug=False): - return _convert_jit(model, inplace, debug, quant_type=QuantType.DYNAMIC) +def convert_dynamic_jit(model, inplace=False, debug=False, preserved_attrs=None): + return _convert_jit(model, inplace, debug, quant_type=QuantType.DYNAMIC, preserved_attrs=preserved_attrs) def _quantize_jit(model, qconfig_dict, run_fn=None, run_args=None, inplace=False, debug=False, quant_type=QuantType.STATIC): # Always do inplace convert because the Tensor is already From 666223df46fbc271a694293db3b4465271717f34 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Tue, 22 Sep 2020 19:42:28 -0700 Subject: [PATCH 036/449] [jit] gtestify test_argument_spec.cpp (#45019) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45019 See https://github.com/pytorch/pytorch/pull/45018 for context. Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D23802298 Pulled By: suo fbshipit-source-id: 0e36d095d4d81dcd5ebe6d56b3dc469d6d5482d0 --- test/cpp/jit/test_argument_spec.cpp | 52 ++++++++++++++++------------- test/cpp/jit/test_base.h | 5 +++ test/cpp/jit/tests.h | 4 --- tools/build_variables.bzl | 44 ------------------------ 4 files changed, 34 insertions(+), 71 deletions(-) diff --git a/test/cpp/jit/test_argument_spec.cpp b/test/cpp/jit/test_argument_spec.cpp index 01e27caac05f..bf40761fc468 100644 --- a/test/cpp/jit/test_argument_spec.cpp +++ b/test/cpp/jit/test_argument_spec.cpp @@ -1,3 +1,5 @@ +#include + #include #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/runtime/argument_spec.h" @@ -5,6 +7,8 @@ namespace torch { namespace jit { +namespace { + int device(const autograd::Variable& v) { return v.device().is_cuda() ? v.get_device() : -1; } @@ -38,8 +42,9 @@ autograd::Variable var( autograd::Variable undef() { return autograd::Variable(); } +} // namespace -void testCompleteArgumentSpec() { +TEST(ArgumentSpecTest, CompleteArgumentSpec_CUDA) { auto const CF = at::CPU(at::kFloat); auto const CD = at::CPU(at::kDouble); auto const GF = at::CUDA(at::kFloat); @@ -94,34 +99,35 @@ void testCompleteArgumentSpec() { ASSERT_EQ(with_const.at(2).sizes().size(), 2); } -size_t hashCode(const TensorTypePtr& ptr) { - return std::hash()(*ptr.get()); -} +// TODO: this test was disabled for unknown reasons and doesn't run. 
+// static size_t hashCode(const TensorTypePtr& ptr) { +// return std::hash()(*ptr.get()); +// } -void testProfiledTensorTypeHashing() { - c10::VaryingShape vs(c10::optional{}); - auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); - auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); - ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); +// TEST(ArgumentSpecTest, VaryingShape) { +// c10::VaryingShape vs(c10::optional{}); +// auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); +// auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); +// ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); - c10::VaryingShape vs22(std::vector{2, 2}); - auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); - auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); +// c10::VaryingShape vs22(std::vector{2, 2}); +// auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); +// auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); - c10::VaryingShape vs23(std::vector{2, 3}); - auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); +// c10::VaryingShape vs23(std::vector{2, 3}); +// auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); - auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); - auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); +// auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); +// auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); - auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); -} +// auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); +// } -void testArgumentSpec() { +TEST(ArgumentSpecTest, Basic_CUDA) { auto& CF = at::CPU(at::kFloat); auto& CD = at::CPU(at::kDouble); auto& GF = at::CUDA(at::kFloat); diff --git a/test/cpp/jit/test_base.h b/test/cpp/jit/test_base.h index 54a59e445e95..25f9e9f36cde 100644 --- a/test/cpp/jit/test_base.h +++ b/test/cpp/jit/test_base.h @@ -10,6 +10,10 @@ #include #else #include "c10/util/Exception.h" +// Temporary: we are going to remove these polyfills entirely. +// But for now avoid redefining them if they are already defined in gtest. 
+// (ASSERT_EQ is a proxy for whether gtest is already present) +#ifndef ASSERT_EQ #define ASSERT_EQ(x, y) TORCH_INTERNAL_ASSERT((x) == (y)) #define ASSERT_NE(x, y) TORCH_INTERNAL_ASSERT((x) != (y)) #define ASSERT_TRUE TORCH_INTERNAL_ASSERT @@ -31,6 +35,7 @@ } \ ASSERT_TRUE(threw); \ } +#endif // ndef(ASSERT_EQ) #endif // defined(USE_GTEST) diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index df45054edc43..452156fc052b 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -113,8 +113,6 @@ namespace jit { #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ @@ -220,8 +218,6 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3cc3585aa555..3f5126358804 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -575,48 +575,4 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): _libtorch_python_sources.extend(libtorch_python_core_sources) _libtorch_python_sources.extend(libtorch_python_distributed_sources) - _libtorch_python_sources.extend([ - "test/cpp/jit/torch_python_test.cpp", - "test/cpp/tensorexpr/padded_buffer.cpp", - "test/cpp/jit/test_alias_analysis.cpp", - "test/cpp/jit/test_argument_spec.cpp", - "test/cpp/jit/test_autodiff.cpp", - "test/cpp/jit/test_backend.cpp", - "test/cpp/jit/test_base.cpp", - "test/cpp/jit/test_class_import.cpp", - "test/cpp/jit/test_class_parser.cpp", - "test/cpp/jit/test_class_type.cpp", - "test/cpp/jit/test_code_template.cpp", - "test/cpp/jit/test_constant_pooling.cpp", - "test/cpp/jit/test_cleanup_passes.cpp", - "test/cpp/jit/test_create_autodiff_subgraphs.cpp", - "test/cpp/jit/test_custom_class.cpp", - "test/cpp/jit/test_custom_operators.cpp", - "test/cpp/jit/test_dce.cpp", - "test/cpp/jit/test_fuser.cpp", - "test/cpp/jit/test_gpu.cpp", - "test/cpp/jit/test_graph_executor.cpp", - "test/cpp/jit/test_inliner.cpp", - "test/cpp/jit/test_interface.cpp", - "test/cpp/jit/test_interpreter.cpp", - "test/cpp/jit/test_ir.cpp", - "test/cpp/jit/test_irparser.cpp", - "test/cpp/jit/test_jit_type.cpp", - "test/cpp/jit/test_lite_interpreter.cpp", - "test/cpp/jit/test_lite_trainer.cpp", - "test/cpp/jit/test_misc.cpp", - "test/cpp/jit/test_mobile_type_parser.cpp", - "test/cpp/jit/test_module_api.cpp", - "test/cpp/jit/test_peephole_optimize.cpp", - "test/cpp/jit/test_qualified_name.cpp", - "test/cpp/jit/test_save_load.cpp", - "test/cpp/jit/test_schema_matching.cpp", - "test/cpp/jit/test_subgraph_matcher.cpp", - "test/cpp/jit/test_subgraph_rewriter.cpp", - "test/cpp/jit/test_subgraph_utils.cpp", - "test/cpp/jit/test_utils.cpp", - ]) - - _libtorch_python_sources.extend(native.glob(["test/cpp/tensorexpr/test_*.cpp"])) - return _libtorch_python_sources From 67a19fecef1605267ea7581e67eb6a1f74b4842c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 22 Sep 2020 20:17:23 -0700 Subject: [PATCH 037/449] CUDA BFloat16 pooling (#45151) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45151 Reviewed By: ailzhang Differential Revision: D23854056 Pulled By: ngimel fbshipit-source-id: 32f0835218c2602a09654a9ac2d161c4eb360f90 --- aten/src/ATen/native/cuda/DilatedMaxPool2d.cu | 284 +++++++++--------- aten/src/ATen/native/cuda/DilatedMaxPool3d.cu | 50 ++- test/test_nn.py | 17 +- 3 files changed, 172 
insertions(+), 179 deletions(-) diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index c629dfc4030c..3e0e70c01952 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -366,70 +366,68 @@ void max_pool2d_with_indices_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *output_data = output.data_ptr(); - scalar_t *input_data = input.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - max_pool_forward_nhwc - <<>>( - input_data, nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - output_data, indices_data); - break; - } - case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *output_data = output.data_ptr(); + scalar_t *input_data = input.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + max_pool_forward_nhwc + <<>>( + input_data, nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + output_data, indices_data); + break; } - }); + case MemoryFormat::Contiguous: { + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + max_pool_forward_nchw + <<>>( + count, input_data, + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); @@ -532,88 +530,86 @@ void max_pool2d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *gradOutput_data = gradOutput.data_ptr(); - scalar_t *gradInput_data = gradInput.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - // The backward kernel is launched on input instead output. - // If it is launched on output layer, atomic_add would not provide much benefit on FP16. - // Please check comments at https://github.com/pytorch/pytorch/pull/34519. - max_pool_backward_nhwc - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - out_stride_c, out_stride_h, out_stride_w, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - gradInput_data); - break; - } - case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *gradOutput_data = gradOutput.data_ptr(); + scalar_t *gradInput_data = gradInput.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + // The backward kernel is launched on input instead output. + // If it is launched on output layer, atomic_add would not provide much benefit on FP16. + // Please check comments at https://github.com/pytorch/pytorch/pull/34519. + max_pool_backward_nhwc + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + out_stride_c, out_stride_h, out_stride_w, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + gradInput_data); + break; } - }); + case MemoryFormat::Contiguous: { + int imgcount = inputWidth * inputHeight; + dim3 grid; + const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; + grid.x = blocks; + grid.y = nbatch; + uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + if (maxGridY < grid.y) grid.y = maxGridY; + grid.z = nInputPlane; + uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + if (maxGridZ < grid.z) grid.z = maxGridZ; + + max_pool_backward_nchw + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index 2b0ba37c8880..9d72e0027007 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -276,20 +276,18 @@ void max_pool3d_with_indices_out_cuda_template( input.scalar_type(), "max_pool3d_with_indices_out_frame", [&]{ - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_out_frame", [&] { - scalar_t *input_data = work_input.data_ptr(); - int64_t totalZ = otime * nslices * nbatch; - - max_pool3d_with_indices_out_frame( - input_data, work_output, work_indices, - totalZ, - itime, iheight, iwidth, - otime, oheight, owidth, - kT, kH, kW, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + scalar_t *input_data = work_input.data_ptr(); + int64_t totalZ = otime * nslices * nbatch; + + max_pool3d_with_indices_out_frame( + input_data, work_output, work_indices, + totalZ, + itime, iheight, iwidth, + otime, oheight, owidth, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } @@ -387,19 +385,17 @@ void max_pool3d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool3d_with_indices_backward_out_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_backward_out_frame", [&] { - const int64_t totalZ = otime * nslices * nbatch; - scalar_t *grad_input_data = work_grad_input.data_ptr(); - - max_pool3d_with_indices_backward_out_frame( - grad_input_data, work_grad_output, work_indices, - totalZ, - itime, iheight, iwidth, - oheight, owidth, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + const int64_t totalZ = otime * nslices * nbatch; + scalar_t *grad_input_data = work_grad_input.data_ptr(); + + max_pool3d_with_indices_backward_out_frame( + grad_input_data, work_grad_output, work_indices, + totalZ, + itime, iheight, iwidth, + oheight, owidth, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } diff --git a/test/test_nn.py b/test/test_nn.py index 00614c0cdc34..281425e26782 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -33,6 +33,7 @@ from torch.autograd.gradcheck import gradgradcheck from torch.nn import Parameter from torch.nn.parallel._functions import Broadcast +from torch.testing import get_all_fp_dtypes from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \ @@ -11552,32 +11553,32 @@ def expected_output(dim): self.assertEqual(output[0, 0, 0, 0], float("-inf")) self.assertEqual(indices[0, 0, 0, 0], 0) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def 
test_AdaptiveMaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, adaptive=True, device=device, dtype=dtype) @@ -11650,7 +11651,7 @@ def test_pooling_zero_stride(self, device): self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero", lambda: fn_module(x)) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_large_size(self, device, dtype): for op in ('max', 'avg'): @@ -11664,7 +11665,7 @@ def test_pool_large_size(self, device, dtype): # check if the output shape was still computed correctly self.assertEqual(x.shape[2], res.shape[2]) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_invalid_size(self, device, dtype): for op in ('max', 'avg'): From 1bd6533d60797949b599843dc5473eda0e3fce65 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 22 Sep 2020 21:13:07 -0700 Subject: [PATCH 038/449] Remove thread_local RecordFunctionGuard from profiler. (#44646) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44646 Per a discussion with ilia-cher, this is not needed anymore and removing it would make some future changes to support async RPC profiling easier. Tested by ensuring profiling tests in `test_autograd.py` still pass. ghstack-source-id: 112605618 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23683998 fbshipit-source-id: 4e49a439509884fe04d922553890ae353e3331ab --- torch/csrc/autograd/profiler.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 049f857f8bbf..9d75eea84328 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -388,9 +388,6 @@ void pushProfilingCallbacks() { const int kCUDAWarmupStart = 5; -// temp. workaround for dispatcher ::Profiler key -thread_local std::vector> g_; - } // namespace void registerCUDAMethods(CUDAStubs* stubs) { @@ -450,7 +447,6 @@ void enableProfiler(const ProfilerConfig& new_config) { c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); pushProfilingCallbacks(); - g_.emplace_back(std::make_shared()); if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to @@ -479,7 +475,6 @@ thread_event_lists disableProfiler() { TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, "Can't disable profiler when it's not running"); - g_.pop_back(); at::removeCallback(state_ptr->callbackHandle()); if (state_ptr->config().state == ProfilerState::NVTX) { From 70d2e4d1f6c46db2edc056c0a4c458c57e3e46f6 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 22 Sep 2020 21:13:07 -0700 Subject: [PATCH 039/449] [RPC profiling] Allow disableProfiler() to be called from another thread. 
(#44653) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44653 This changes the profiler per a discussion with ilia-cher offline that enables `disableProfiler()` event consolidation logic to be called from different threads (i.e. threads where the profiler was not explicitly enabled). This is needed to support the functionality enabled by D23638387 where we defer profiling event collection until executing an async callback that can execute on a different thread, to support RPC async function profiling. This is done by introducing 2 flags `cleanupTLSState` and `consolidate` which controls whether we should clean up thread local settings (we don't do this when calling `disableProfiler()` on non-main threads) and whether we should consolidate all profiled events. Backwards compatiblity is ensured since both options are true by default. Added a test in `test_misc.cpp` to test this. ghstack-source-id: 112605620 Reviewed By: mrshenli Differential Revision: D23638499 fbshipit-source-id: f5bbb0d41ef883c5e5870bc27e086b8b8908f46b --- c10/util/ThreadLocalDebugInfo.cpp | 9 +++++++ c10/util/ThreadLocalDebugInfo.h | 3 +++ test/cpp/jit/test_misc.cpp | 41 +++++++++++++++++++++++++++++++ test/cpp/jit/tests.h | 1 + torch/csrc/autograd/init.cpp | 6 ++++- torch/csrc/autograd/profiler.cpp | 19 +++++++++----- torch/csrc/autograd/profiler.h | 2 +- 7 files changed, 73 insertions(+), 8 deletions(-) diff --git a/c10/util/ThreadLocalDebugInfo.cpp b/c10/util/ThreadLocalDebugInfo.cpp index a9cdc26b5934..20d473667a8d 100644 --- a/c10/util/ThreadLocalDebugInfo.cpp +++ b/c10/util/ThreadLocalDebugInfo.cpp @@ -51,6 +51,15 @@ std::shared_ptr ThreadLocalDebugInfo::_pop(DebugInfoKind kind) { return res->info_; } +/* static */ +std::shared_ptr ThreadLocalDebugInfo::_peek(DebugInfoKind kind) { + TORCH_CHECK( + debug_info && debug_info->kind_ == kind, + "Expected debug info of type ", + (size_t)kind); + return debug_info->info_; +} + DebugInfoGuard::DebugInfoGuard( DebugInfoKind kind, std::shared_ptr info) { diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index 207abed781b0..9620cfb9fdea 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -46,6 +46,9 @@ class C10_API ThreadLocalDebugInfo { // Pop debug info, throws in case the last pushed // debug info is not of a given kind static std::shared_ptr _pop(DebugInfoKind kind); + // Peek debug info, throws in case the last pushed debug info is not of the + // given kind + static std::shared_ptr _peek(DebugInfoKind kind); private: std::shared_ptr info_; diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 4941c11d6cae..953d1bf42fc0 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2184,5 +2184,46 @@ void testTLSFutureCallbacks() { } } +void testProfilerDisableInCallback() { + // cb that verifies the profiler is enabled + auto profilerEnabledCb = []() { + ASSERT_TRUE(torch::autograd::profiler::profilerEnabled()); + }; + torch::autograd::profiler::enableProfiler( + torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, false, false)); + auto s1 = c10::make_intrusive(IntType::get()); + s1->addCallback(wrapPropagateTLSState([&profilerEnabledCb] { + // Ensure the profiler is still enabled in this thread. + profilerEnabledCb(); + auto t1 = torch::ones({2, 2}); + auto t2 = torch::ones({2, 2}); + torch::add(t1, t2); + // Don't cleanup TLSState, and just consolidate. 
+ auto thread_event_lists = + torch::autograd::profiler::disableProfiler(false, true); + // Ensure that the events from this thread are still profiled and we obtain + // the expected in events in our consolidated list when calling + // disableProfiler(). + bool found_ones = false; + bool found_add = false; + for (const auto& li : thread_event_lists) { + for (const auto& evt : li) { + if (strcmp(evt.name(), "aten::add") == 0) { + found_add = true; + } else if (strcmp(evt.name(), "aten::ones") == 0) { + found_ones = true; + } + } + } + ASSERT_TRUE(found_ones); + ASSERT_TRUE(found_add); + })); + // Disable the profiler, but do not consolidate results in the main thread. + torch::autograd::profiler::disableProfiler(true, false); + std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); }); + t.join(); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 452156fc052b..45d7f48b1f8a 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -84,6 +84,7 @@ namespace jit { _(DefaultArgTypeHinting) \ _(Futures) \ _(TLSFutureCallbacks) \ + _(ProfilerDisableInCallback) \ _(MobileTypeParser) \ _(LiteInterpreterBuiltinFunction) \ _(LiteInterpreterPrim) \ diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index c92654cf7815..69759d1948b2 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -61,7 +61,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("sequence_nr", &Event::sequence_nr); m.def("_enable_profiler", enableProfiler); - m.def("_disable_profiler", disableProfiler); + m.def( + "_disable_profiler", + disableProfiler, + py::arg("cleanup_tls_states") = true, + py::arg("consolidate") = true); m.def("_profiler_enabled", profilerEnabled); m.def("_enable_record_function", [](bool enable) { at::enableRecordFunction(enable); diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 9d75eea84328..bab21ee5a7a8 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -442,7 +442,6 @@ void enableProfiler(const ProfilerConfig& new_config) { auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(new_config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); @@ -468,21 +467,29 @@ void enableProfiler(const ProfilerConfig& new_config) { state->mark("__start_profile", false); } -thread_event_lists disableProfiler() { +thread_event_lists disableProfiler(bool cleanupTLSState, bool consolidate) { // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard - auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + std::shared_ptr state; + if (cleanupTLSState) { + state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + } else { + state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE); + } + auto state_ptr = static_cast(state.get()); TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, "Can't disable profiler when it's not running"); - at::removeCallback(state_ptr->callbackHandle()); + if (cleanupTLSState) { + at::removeCallback(state_ptr->callbackHandle()); + } - if (state_ptr->config().state == ProfilerState::NVTX) { + if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { return thread_event_lists(); } state_ptr->mark("__stop_profile"); - + // Note 
that this will erase the underlying events. return state_ptr->consolidate(); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3f962eff341d..6a7c5095a071 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -341,7 +341,7 @@ using thread_event_lists = std::vector>; // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfiler(const ProfilerConfig&); -TORCH_API thread_event_lists disableProfiler(); +TORCH_API thread_event_lists disableProfiler(bool cleanupTLSState = true, bool consolidate = true); // adds profiledEvents to the current thread local recorded events. Each event // will be marked with node ID given by fromNodeId. TORCH_API void addEventList(std::vector&& profiledEvents); From d4a634c2093d1a47ed0390765a7f4d4c6d70e015 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 22 Sep 2020 21:13:07 -0700 Subject: [PATCH 040/449] [RPC profiling] Don't wrap toHere() calls with profiling (#44655) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44655 Since `toHere()` does not execute operations over RPC and simply transfers the value to the local node, we don't need to enable the profiler remotely for this message. This causes unnecessary overhead and is not needed. Since `toHere` is a blocking call, we already profile the call on the local node using `RECORD_USER_SCOPE`, so this does not change the expected profiler results (validated by ensuring all remote profiling tests pass). ghstack-source-id: 112605610 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23641466 fbshipit-source-id: 109d9eb10bd7fe76122b2026aaf1c7893ad10588 --- torch/csrc/distributed/autograd/utils.cpp | 5 +++-- torch/csrc/distributed/autograd/utils.h | 3 ++- torch/csrc/distributed/rpc/rref_impl.cpp | 9 +++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 726cc605a913..464d8248d8a4 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -143,7 +143,8 @@ std::shared_ptr sendMessageWithAutograd( const WorkerInfo& dst, torch::distributed::rpc::Message&& wrappedRpcMsg, bool forceGradRecording, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + bool forceDisableProfiling) { auto msg = getMessageWithAutograd( dst.id_, std::move(wrappedRpcMsg), @@ -153,7 +154,7 @@ std::shared_ptr sendMessageWithAutograd( std::shared_ptr fut; // If profiler is enabled, wrap this message with profiling metadata that will // tell the remote end to process this request with the profiler enabled. 
- if (torch::autograd::profiler::profilerEnabled()) { + if (!forceDisableProfiling && torch::autograd::profiler::profilerEnabled()) { auto profilerConfig = torch::autograd::profiler::getProfilerConfig(); auto msgWithProfiling = getMessageWithProfiling( std::move(msg), diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index c6316378a146..2a0a066e1a95 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -51,7 +51,8 @@ sendMessageWithAutograd( const rpc::WorkerInfo& dst, rpc::Message&& wrappedRpcMsg, bool forceGradRecording = false, - const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout); + const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, + bool forceDisableProfiling = false); } // namespace autograd } // namespace distributed diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index 34249172473c..6c6a377a4652 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -141,9 +141,6 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { "to_here#({})->({})", RpcAgent::getCurrentRpcAgent()->getWorkerInfo().name_, RpcAgent::getCurrentRpcAgent()->getWorkerInfo(ownerId_).name_); - auto& remoteProfilerManager = - torch::distributed::rpc::RemoteProfilerManager::getInstance(); - remoteProfilerManager.setCurrentKey(toHereKey); } RECORD_USER_SCOPE(toHereKey); TORCH_CHECK( @@ -170,12 +167,16 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { msgToSend = ScriptRRefFetchCall(ownerId_, rrefId()).toMessage(); } + // toHere is profiled as a blocking call, and does not execute operations on + // the remote node. Hence, don't wrap it with a profiling message since we + // don't need the profiler to be enabled remotely. auto futureResponse = autograd::sendMessageWithAutograd( *agent, agent->getWorkerInfo(ownerId_), std::move(msgToSend), true /* forceGradRecording */, - timeoutSeconds); + timeoutSeconds, + true /* forceDisableProfiling */); // TODO: we should ideally be able to interrupt this blocking wait if we check // getTimedOut() and it is true From cb75addee4dfd7604766397f8f0a294b950c4a03 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 22 Sep 2020 21:15:15 -0700 Subject: [PATCH 041/449] torch.package - a way to package models and code (#45015) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45015 torch.package allows you to write packages of code, pickled python data, and arbitrary binary and text resources into a self-contained package. torch.package.PackageExporter writes the packages and torch.package.PackageImporter reads them. The importers can load this code in a hermetic way, such that code is loaded from the package rather than the normal python import system. This allows for the packaging of PyTorch model code and data so that it can be run on a server or used in the future for transfer learning. The code contained in packages is copied file-by-file from the original source when it is created, and the file format is a specially organized zip file. Future users of the package can unzip the package, and edit the code in order to perform custom modifications to it. The importer for packages ensures that code in the module can only be loaded from within the package, except for modules explicitly listed as external using :method:`extern_module`. 
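A rough usage sketch (exercising only the APIs covered by the new `test_package.py`; the archive name is arbitrary, and `package_a`/`module_a` are the test fixtures added in this PR, standing in for any locally importable module):

```python
from torch.package import PackageExporter, PackageImporter

data = {"note": "any picklable object works here"}

# Write a package: module source is copied file-by-file into the zip archive,
# and pickled data pulls in whatever code it depends on.
with PackageExporter('example_package.zip', verbose=False) as exporter:
    exporter.extern_modules(['module_a'])          # resolved from the loading environment
    exporter.save_module('package_a')              # source copied into the package
    exporter.save_pickle('res', 'data.pkl', data)

# Read it back hermetically: imports resolve from inside the archive rather than
# sys.path, except for the modules declared extern above.
importer = PackageImporter('example_package.zip')
pkg = importer.import_module('package_a')
data_loaded = importer.load_pickle('res', 'data.pkl')
```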
The file `extern_modules` in the zip archive lists all the modules that a package externally depends on. This prevents "implicit" dependencies where the package runs locally because it is importing a locally-installed package, but then fails when the package is copied to another machine. Test Plan: Imported from OSS Reviewed By: SplitInfinity Differential Revision: D23824337 Pulled By: zdevito fbshipit-source-id: 1247c34ba9b656f9db68a83e31f2a0fbe3bea6bd --- test/module_a.py | 1 + test/namespace_b/subpackage.py | 0 test/package_a/__init__.py | 7 + test/package_a/subpackage.py | 3 + test/run_test.py | 3 +- test/test_package.py | 309 +++++++++++++++++ torch/package/__init__.py | 2 + torch/package/_custom_import_pickler.py | 78 +++++ torch/package/_importlib.py | 83 +++++ torch/package/_mock.py | 39 +++ torch/package/_mock_zipreader.py | 48 +++ torch/package/exporter.py | 435 ++++++++++++++++++++++++ torch/package/find_file_dependencies.py | 42 +++ torch/package/importer.py | 388 +++++++++++++++++++++ torch/serialization.py | 4 +- 15 files changed, 1439 insertions(+), 3 deletions(-) create mode 100644 test/module_a.py create mode 100644 test/namespace_b/subpackage.py create mode 100644 test/package_a/__init__.py create mode 100644 test/package_a/subpackage.py create mode 100644 test/test_package.py create mode 100644 torch/package/__init__.py create mode 100644 torch/package/_custom_import_pickler.py create mode 100644 torch/package/_importlib.py create mode 100644 torch/package/_mock.py create mode 100644 torch/package/_mock_zipreader.py create mode 100644 torch/package/exporter.py create mode 100644 torch/package/find_file_dependencies.py create mode 100644 torch/package/importer.py diff --git a/test/module_a.py b/test/module_a.py new file mode 100644 index 000000000000..685af9bc1569 --- /dev/null +++ b/test/module_a.py @@ -0,0 +1 @@ +result = 'module_a' diff --git a/test/namespace_b/subpackage.py b/test/namespace_b/subpackage.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/package_a/__init__.py b/test/package_a/__init__.py new file mode 100644 index 000000000000..4761b3db5e41 --- /dev/null +++ b/test/package_a/__init__.py @@ -0,0 +1,7 @@ +result = 'package_a' + +class PackageAObject: + __slots__ = ['obj'] + + def __init__(self, obj): + self.obj = obj diff --git a/test/package_a/subpackage.py b/test/package_a/subpackage.py new file mode 100644 index 000000000000..46f729d51852 --- /dev/null +++ b/test/package_a/subpackage.py @@ -0,0 +1,3 @@ +result = 'package_a.subpackage' +class PackageASubpackageObject: + pass diff --git a/test/run_test.py b/test/run_test.py index 606e20a6f723..d63fc372f9c2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -89,7 +89,8 @@ 'test_determination', 'test_futures', 'test_fx', - 'test_functional_autograd_benchmark' + 'test_functional_autograd_benchmark', + 'test_package', ] WINDOWS_BLOCKLIST = [ diff --git a/test/test_package.py b/test/test_package.py new file mode 100644 index 000000000000..a25726a53c00 --- /dev/null +++ b/test/test_package.py @@ -0,0 +1,309 @@ +from unittest import main, skipIf +from torch.testing._internal.common_utils import TestCase, IS_WINDOWS +from tempfile import NamedTemporaryFile +from torch.package import PackageExporter, PackageImporter +from pathlib import Path +from tempfile import TemporaryDirectory +import torch +from sys import version_info + +try: + from torchvision.models import resnet18 + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False +skipIfNoTorchVision = skipIf(not 
HAS_TORCHVISION, "no torchvision") + + + +packaging_directory = Path(__file__).parent + +class PackagingTest(TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._temporary_files = [] + + def temp(self): + t = NamedTemporaryFile() + name = t.name + if IS_WINDOWS: + t.close() # can't read an open file in windows + else: + self._temporary_files.append(t) + return name + + def tearDown(self): + for t in self._temporary_files: + t.close() + self._temporary_files = [] + + def test_saving_source(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_source_file('foo', str(packaging_directory / 'module_a.py')) + he.save_source_file('foodir', str(packaging_directory / 'package_a')) + hi = PackageImporter(filename) + foo = hi.import_module('foo') + s = hi.import_module('foodir.subpackage') + self.assertEqual(foo.result, 'module_a') + self.assertEqual(s.result, 'package_a.subpackage') + + def test_saving_string(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + src = """\ +import math +the_math = math +""" + he.save_source_string('my_mod', src) + hi = PackageImporter(filename) + m = hi.import_module('math') + import math + self.assertIs(m, math) + my_mod = hi.import_module('my_mod') + self.assertIs(my_mod.math, math) + + def test_save_module(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + import module_a + import package_a + he.save_module(module_a.__name__) + he.save_module(package_a.__name__) + hi = PackageImporter(filename) + module_a_i = hi.import_module('module_a') + self.assertEqual(module_a_i.result, 'module_a') + self.assertIsNot(module_a, module_a_i) + package_a_i = hi.import_module('package_a') + self.assertEqual(package_a_i.result, 'package_a') + self.assertIsNot(package_a_i, package_a) + + def test_pickle(self): + import package_a.subpackage + obj = package_a.subpackage.PackageASubpackageObject() + obj2 = package_a.PackageAObject(obj) + + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_pickle('obj', 'obj.pkl', obj2) + hi = PackageImporter(filename) + + # check we got dependencies + sp = hi.import_module('package_a.subpackage') + # check we didn't get other stuff + with self.assertRaises(ImportError): + hi.import_module('module_a') + + obj_loaded = hi.load_pickle('obj', 'obj.pkl') + self.assertIsNot(obj2, obj_loaded) + self.assertIsInstance(obj_loaded.obj, sp.PackageASubpackageObject) + self.assertIsNot(package_a.subpackage.PackageASubpackageObject, sp.PackageASubpackageObject) + + def test_resources(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_text('main', 'main', "my string") + he.save_binary('main', 'main_binary', "my string".encode('utf-8')) + src = """\ +import resources +t = resources.load_text('main', 'main') +b = resources.load_binary('main', 'main_binary') +""" + he.save_source_string('main', src, is_package=True) + hi = PackageImporter(filename) + m = hi.import_module('main') + self.assertEqual(m.t, "my string") + self.assertEqual(m.b, "my string".encode('utf-8')) + + def test_extern(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.extern_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + import module_a + + module_a_im = hi.import_module('module_a') + hi.import_module('package_a.subpackage') + 
package_a_im = hi.import_module('package_a') + + self.assertIs(module_a, module_a_im) + self.assertIsNot(package_a, package_a_im) + self.assertIs(package_a.subpackage, package_a_im.subpackage) + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_mock(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.mock_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + _ = package_a.subpackage + import module_a + _ = module_a + + m = hi.import_module('package_a.subpackage') + r = m.result + with self.assertRaisesRegex(NotImplementedError, 'was mocked out'): + r() + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_custom_requires(self): + filename = self.temp() + + class Custom(PackageExporter): + def require_module(self, name, dependencies): + if name == 'module_a': + self.mock_module('module_a') + elif name == 'package_a': + self.save_source_string('package_a', 'import module_a\nresult = 5\n') + else: + raise NotImplementedError('wat') + + with Custom(filename, verbose=False) as he: + he.save_source_string('main', 'import package_a\n') + + hi = PackageImporter(filename) + hi.import_module('module_a').should_be_mocked + bar = hi.import_module('package_a') + self.assertEqual(bar.result, 5) + + @skipIfNoTorchVision + def test_resnet(self): + resnet = resnet18() + + f1 = self.temp() + + # create a package that will save it along with its code + with PackageExporter(f1, verbose=False) as e: + # put the pickled resnet in the package, by default + # this will also save all the code files references by + # the objects in the pickle + e.save_pickle('model', 'model.pkl', resnet) + + # we can now load the saved model + i = PackageImporter(f1) + r2 = i.load_pickle('model', 'model.pkl') + + # test that it works + input = torch.rand(1, 3, 224, 224) + ref = resnet(input) + self.assertTrue(torch.allclose(r2(input), ref)) + + # functions exist also to get at the private modules in each package + torchvision = i.import_module('torchvision') + + f2 = self.temp() + # if we are doing transfer learning we might want to re-save + # things that were loaded from a package + with PackageExporter(f2, verbose=False) as e: + # We need to tell the exporter about any modules that + # came from imported packages so that it can resolve + # class names like torchvision.models.resnet.ResNet + # to their source code. + + e.importers.insert(0, i.import_module) + + # e.importers is a list of module importing functions + # that by default contains importlib.import_module. + # it is searched in order until the first success and + # that module is taken to be what torchvision.models.resnet + # should be in this code package. In the case of name collisions, + # such as trying to save a ResNet from two different packages, + # we take the first thing found in the path, so only ResNet objects from + # one importer will work. This avoids a bunch of name mangling in + # the source code. If you need to actually mix ResNet objects, + # we suggest reconstructing the model objects using code from a single package + # using functions like save_state_dict and load_state_dict to transfer state + # to the correct code objects. 
+ e.save_pickle('model', 'model.pkl', r2) + + i2 = PackageImporter(f2) + r3 = i2.load_pickle('model', 'model.pkl') + self.assertTrue(torch.allclose(r3(input), ref)) + + # test we can load from a directory + import zipfile + zf = zipfile.ZipFile(f1, 'r') + + with TemporaryDirectory() as td: + zf.extractall(path=td) + iz = PackageImporter(str(Path(td) / Path(f1).name)) + r4 = iz.load_pickle('model', 'model.pkl') + self.assertTrue(torch.allclose(r4(input), ref)) + + @skipIfNoTorchVision + def test_model_save(self): + + # This example shows how you might package a model + # so that the creator of the model has flexibility about + # how they want to save it but the 'server' can always + # use the same API to load the package. + + # The convension is for each model to provide a + # 'model' package with a 'load' function that actual + # reads the model out of the archive. + + # How the load function is implemented is up to the + # the packager. + + # get our normal torchvision resnet + resnet = resnet18() + + + f1 = self.temp() + # Option 1: save by pickling the whole model + # + single-line, similar to torch.jit.save + # - more difficult to edit the code after the model is created + with PackageExporter(f1, verbose=False) as e: + e.save_pickle('model', 'pickled', resnet) + # note that this source is the same for all models in this approach + # so it can be made part of an API that just takes the model and + # packages it with this source. + src = """\ +import resources # gives you access to the importer from within the package + +# server knows to call model.load() to get the model, +# maybe in the future it passes options as arguments by convension +def load(): + return resources.load_pickle('model', 'pickled') + """ + e.save_source_string('model', src, is_package=True) + + f2 = self.temp() + # Option 2: save with state dict + # - more code to write to save/load the model + # + but this code can be edited later to adjust adapt the model later + with PackageExporter(f2, verbose=False) as e: + e.save_pickle('model', 'state_dict', resnet.state_dict()) + src = """\ +import resources # gives you access to the importer from within the package +from torchvision.models.resnet import resnet18 +def load(): + # if you want, you can later edit how resnet is constructed here + # to edit the model in the package, while still loading the original + # state dict weights + r = resnet18() + state_dict = resources.load_pickle('model', 'state_dict') + r.load_state_dict(state_dict) + return r + """ + e.save_source_string('model', src, is_package=True) + + + + # regardless of how we chose to package, we can now use the model in a server in the same way + input = torch.rand(1, 3, 224, 224) + results = [] + for m in [f1, f2]: + importer = PackageImporter(m) + the_model = importer.import_module('model').load() + r = the_model(input) + results.append(r) + + self.assertTrue(torch.allclose(*results)) + +if __name__ == '__main__': + main() diff --git a/torch/package/__init__.py b/torch/package/__init__.py new file mode 100644 index 000000000000..be7159a1836d --- /dev/null +++ b/torch/package/__init__.py @@ -0,0 +1,2 @@ +from .importer import PackageImporter +from .exporter import PackageExporter diff --git a/torch/package/_custom_import_pickler.py b/torch/package/_custom_import_pickler.py new file mode 100644 index 000000000000..fd5787b6b3e3 --- /dev/null +++ b/torch/package/_custom_import_pickler.py @@ -0,0 +1,78 @@ +from pickle import _Pickler, _getattribute, whichmodule, _extension_registry, _compat_pickle # type: 
ignore
+from pickle import GLOBAL, STACK_GLOBAL, EXT1, EXT2, EXT4, PicklingError
+from struct import pack
+
+class CustomImportPickler(_Pickler):
+    def __init__(self, import_module, *args, **kwargs):
+        self.import_module = import_module
+        super().__init__(*args, **kwargs)
+
+    def save_global(self, obj, name=None):
+        # unfortunately the pickler code is factored in a way that
+        # forces us to copy/paste this function. The only change is marked
+        # CHANGED below.
+        write = self.write
+        memo = self.memo
+
+        if name is None:
+            name = getattr(obj, '__qualname__', None)
+        if name is None:
+            name = obj.__name__
+
+        module_name = whichmodule(obj, name)
+        try:
+            # CHANGED: self.import_module rather than
+            # __import__
+            module = self.import_module(module_name)
+            obj2, parent = _getattribute(module, name)
+        except (ImportError, KeyError, AttributeError):
+            raise PicklingError(
+                "Can't pickle %r: it's not found as %s.%s" %
+                (obj, module_name, name)) from None
+        else:
+            if obj2 is not obj:
+                raise PicklingError(
+                    "Can't pickle %r: it's not the same object as %s.%s" %
+                    (obj, module_name, name))
+
+        if self.proto >= 2:
+            code = _extension_registry.get((module_name, name))
+            if code:
+                assert code > 0
+                if code <= 0xff:
+                    write(EXT1 + pack("<B", code))
+                elif code <= 0xffff:
+                    write(EXT2 + pack("<H", code))
+                else:
+                    write(EXT4 + pack("<i", code))
+                return
+        lastname = name.rpartition('.')[2]
+        if parent is module:
+            name = lastname
+        # Non-ASCII identifiers are supported only with protocols >= 3.
+        if self.proto >= 4:
+            self.save(module_name)
+            self.save(name)
+            write(STACK_GLOBAL)
+        elif parent is not module:
+            self.save_reduce(getattr, (parent, lastname))
+        elif self.proto >= 3:
+            write(GLOBAL + bytes(module_name, "utf-8") + b'\n' +
+                  bytes(name, "utf-8") + b'\n')
+        else:
+            if self.fix_imports:
+                r_name_mapping = _compat_pickle.REVERSE_NAME_MAPPING
+                r_import_mapping = _compat_pickle.REVERSE_IMPORT_MAPPING
+                if (module_name, name) in r_name_mapping:
+                    module_name, name = r_name_mapping[(module_name, name)]
+                elif module_name in r_import_mapping:
+                    module_name = r_import_mapping[module_name]
+            try:
+                write(GLOBAL + bytes(module_name, "ascii") + b'\n' +
+                      bytes(name, "ascii") + b'\n')
+            except UnicodeEncodeError:
+                raise PicklingError(
+                    "can't pickle global identifier '%s.%s' using "
+                    "pickle protocol %i" % (module, name, self.proto)) from None
+
+        self.memoize(obj)
diff --git a/torch/package/_importlib.py b/torch/package/_importlib.py
new file mode 100644
index 000000000000..1b521ca1a962
--- /dev/null
+++ b/torch/package/_importlib.py
@@ -0,0 +1,83 @@
+import _warnings
+import os.path
+# note: implementations
+# copied from cpython's import code
+
+
+# _zip_searchorder defines how we search for a module in the Zip
+# archive: we first search for a package __init__, then for
+# non-package .pyc, and .py entries. The .pyc entries
+# are swapped by initzipimport() if we run in optimized mode. Also,
+# '/' is replaced by path_sep there.
+
+_zip_searchorder = (
+    ('/__init__.py', True),
+    ('.py', False),
+)
+
+# Replace any occurrences of '\r\n?' in the input string with '\n'.
+# This converts DOS and Mac line endings to Unix line endings.
+def _normalize_line_endings(source): + source = source.replace(b'\r\n', b'\n') + source = source.replace(b'\r', b'\n') + return source + +def _resolve_name(name, package, level): + """Resolve a relative module name to an absolute one.""" + bits = package.rsplit('.', level - 1) + if len(bits) < level: + raise ValueError('attempted relative import beyond top-level package') + base = bits[0] + return '{}.{}'.format(base, name) if name else base + +def _sanity_check(name, package, level): + """Verify arguments are "sane".""" + if not isinstance(name, str): + raise TypeError('module name must be str, not {}'.format(type(name))) + if level < 0: + raise ValueError('level must be >= 0') + if level > 0: + if not isinstance(package, str): + raise TypeError('__package__ not set to a string') + elif not package: + raise ImportError('attempted relative import with no known parent ' + 'package') + if not name and level == 0: + raise ValueError('Empty module name') + +def _calc___package__(globals): + """Calculate what __package__ should be. + + __package__ is not guaranteed to be defined or could be set to None + to represent that its proper value is unknown. + + """ + package = globals.get('__package__') + spec = globals.get('__spec__') + if package is not None: + if spec is not None and package != spec.parent: + _warnings.warn("__package__ != __spec__.parent " + f"({package!r} != {spec.parent!r})", + ImportWarning, stacklevel=3) + return package + elif spec is not None: + return spec.parent + else: + _warnings.warn("can't resolve package from __spec__ or __package__, " + "falling back on __name__ and __path__", + ImportWarning, stacklevel=3) + package = globals['__name__'] + if '__path__' not in globals: + package = package.rpartition('.')[0] + return package + +def _normalize_path(path): + """Normalize a path by ensuring it is a string. + + If the resulting string contains path separators, an exception is raised. 
+ """ + parent, file_name = os.path.split(path) + if parent: + raise ValueError('{!r} must be only a file name'.format(path)) + else: + return file_name diff --git a/torch/package/_mock.py b/torch/package/_mock.py new file mode 100644 index 000000000000..d291bb58ba5e --- /dev/null +++ b/torch/package/_mock.py @@ -0,0 +1,39 @@ + +_magic_methods = ['__subclasscheck__', '__hex__', '__rmul__', + '__float__', '__idiv__', '__setattr__', '__div__', '__invert__', + '__nonzero__', '__rshift__', + '__eq__', '__pos__', '__round__', + '__rand__', '__or__', '__complex__', '__divmod__', + '__len__', '__reversed__', '__copy__', '__reduce__', + '__deepcopy__', '__rdivmod__', '__rrshift__', '__ifloordiv__', + '__hash__', '__iand__', '__xor__', '__isub__', '__oct__', + '__ceil__', '__imod__', '__add__', '__truediv__', + '__unicode__', '__le__', '__delitem__', '__sizeof__', '__sub__', + '__ne__', '__pow__', '__bytes__', '__mul__', + '__itruediv__', '__bool__', '__iter__', '__abs__', + '__gt__', '__iadd__', '__enter__', + '__floordiv__', '__call__', '__neg__', + '__and__', '__ixor__', '__getitem__', '__exit__', '__cmp__', + '__getstate__', '__index__', '__contains__', '__floor__', '__lt__', '__getattr__', + '__mod__', '__trunc__', '__delattr__', '__instancecheck__', '__setitem__', '__ipow__', + '__ilshift__', '__long__', '__irshift__', '__imul__', + '__lshift__', '__dir__', '__ge__', '__int__', '__ior__'] + + +class MockedObject: + _name: str + + def __init__(self, name): + self.__dict__['_name'] = name + + def __repr__(self): + return f"MockedObject({self._name})" + + +def install_method(method_name): + def _not_implemented(self, *args, **kwargs): + raise NotImplementedError(f"Object '{self._name}' was mocked out during packaging but it is being used in {method_name}") + setattr(MockedObject, method_name, _not_implemented) + +for method_name in _magic_methods: + install_method(method_name) diff --git a/torch/package/_mock_zipreader.py b/torch/package/_mock_zipreader.py new file mode 100644 index 000000000000..b273d41fba51 --- /dev/null +++ b/torch/package/_mock_zipreader.py @@ -0,0 +1,48 @@ +import torch +from glob import glob +import os.path +from typing import List, Any + +_storages : List[Any] = [ + torch.DoubleStorage, + torch.FloatStorage, + torch.LongStorage, + torch.IntStorage, + torch.ShortStorage, + torch.CharStorage, + torch.ByteStorage, + torch.BoolStorage, +] +_dtype_to_storage = { + data_type(0).dtype: data_type for data_type in _storages +} + +# because get_storage_from_record returns a tensor!? 
+class _HasStorage(object): + def __init__(self, storage): + self._storage = storage + + def storage(self): + return self._storage + + +class MockZipReader(object): + def __init__(self, directory): + self.directory = directory + + def get_record(self, name): + filename = f'{self.directory}/{name}' + with open(filename, 'rb') as f: + return f.read() + + def get_storage_from_record(self, name, numel, dtype): + storage = _dtype_to_storage[dtype] + filename = f'{self.directory}/{name}' + return _HasStorage(storage.from_file(filename=filename, size=numel)) + + def get_all_records(self, ): + files = [] + for filename in glob(f'{self.directory}/**', recursive=True): + if not os.path.isdir(filename): + files.append(filename[len(self.directory) + 1:]) + return files diff --git a/torch/package/exporter.py b/torch/package/exporter.py new file mode 100644 index 000000000000..8530f6f68f3a --- /dev/null +++ b/torch/package/exporter.py @@ -0,0 +1,435 @@ +import torch +from torch.serialization import normalize_storage_type, location_tag, _should_read_directly +import io +import pickle +import pickletools +from .find_file_dependencies import find_files_source_depends_on +from ._custom_import_pickler import CustomImportPickler +from ._importlib import _normalize_path +import types +import importlib +from typing import List, Any, Callable, Dict +from distutils.sysconfig import get_python_lib +from pathlib import Path +import linecache +import sys + +class PackageExporter: + """ Exporters allow you to write packages of code, pickled python data, and + arbitrary binary and text resources into a self-contained package. + + Imports can load this code in a hermetic way, such that code is loaded + from the package rather than the normal python import system. This allows + for the packaging of PyTorch model code and data so that it can be run + on a server or used in the future for transfer learning. + + The code contained in packages is copied file-by-file from the original + source when it is created, and the file format is a specially organized + zip file. Future users of the package can unzip the package, and edit the code + in order to perform custom modifications to it. + + The importer for packages ensures that code in the module can only be loaded from + within the package, except for modules explicitly listed as external using :method:`extern_module`. + The file `extern_modules` in the zip archive lists all the modules that a package externally depends on. + This prevents "implicit" dependencies where the package runs locally because it is importing + a locally-installed package, but then fails when the package is copied to another machine. + + + Dependencies + ------------ + + When source code is added to the package, the exporter optionally can scan it + for further code dependencies (`dependencies=True`). It looks for import statements, + resolves relative references to qualified module names, and calls :method:`require_module` + on each it finds, recursively resolving dependencies. + + """ + + importers: List[Callable[[str], Any]] + """ A list of functions that will be called in order to find the module assocated + with module names referenced by other modules or by pickled objects. Initialized to + `[importlib.import_module]` by default. When pickling code or objects that was loaded + from an imported packaged, that `importer.import_module` should be put into the importer list. 
+ When a name conflict occurs between importers, the first importer in the list takes precedence, + and only objects that refer to this first importers class can be saved + """ + + + def __init__(self, filename: str, verbose: bool = True): + """ + Create an exporter. + + Args: + filename: e.g. my_package.zip + verbose: Print information about dependency resolution to stdout. + Useful for tracking down why certain files get included. + """ + self.zip_file = torch._C.PyTorchFileWriter(filename) + self.serialized_storages : Dict[str, Any] = {} + self.external : List[str] = [] + self.provided : Dict[str, bool] = {} + self.verbose = verbose + self.importers = [importlib.import_module] + + def save_source_file(self, module_name: str, file_or_directory: str, dependencies=True): + """Adds the local file system `file_or_directory` to the source package to provide the code + for `module_name`. + + Args: + module_name (str): e.g. `my_package.my_subpackage`, code will be saved to provide code for this package. + file_or_directory (str): the path to a file or directory of code. When a directory, all python files in the directory + are recursively copied using :meth:`save_source_file`. If a file is named "/__init__.py" the code is treated + as a package. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). + """ + path = Path(file_or_directory) + if path.is_dir(): + to_save = [] # list of tuples with arguments to save_source_string + module_path = module_name.replace('.', '/') + for filename in path.glob('**/*.py'): + relative_path = filename.relative_to(path).as_posix() + archivename = module_path + '/' + relative_path + if filename.is_dir(): + self.provided[archivename] = True + else: + submodule_name = None + if filename.name == '__init__.py': + submodule_name = archivename[:-len('/__init__.py')].replace('/', '.') + is_package = True + else: + submodule_name = archivename[:-len('.py')].replace('/', '.') + is_package = False + + self.provided[submodule_name] = True + # we delay the call to save_source_string so that we record all the source files + # being provided by this directory structure _before_ attempting to resolve the dependencies + # on the source. This makes sure we don't try to copy over modules that will just get + # overwritten by this directory blob + to_save.append((submodule_name, _read_file(str(filename)), is_package, dependencies, str(filename))) + + for item in to_save: + self.save_source_string(*item) + else: + is_package = path.name == '__init__.py' + self.save_source_string(module_name, _read_file(file_or_directory), is_package, dependencies, file_or_directory) + + def save_source_string(self, module_name: str, src: str, is_package: bool = False, + dependencies: bool = True, orig_file_name: str = None): + """Adds `src` as the source code for `module_name` in the exported package. + + Args: + module_name (str): e.g. `my_package.my_subpackage`, code will be saved to provide code for this package. + src (str): The python source code to save for this package + is_package (bool, optional): If True, this module is treated as a package. Packages are allowed to have submodules + (e.g. my_package.my_subpackage.my_subsubpackage), and resources can be saved inside them. Defaults to False. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). + orig_file_name (str, optional): If present, used in logging to identifying where the source came from. Defaults to None. 
+ """ + self.provided[module_name] = True + extension = '/__init__.py' if is_package else '.py' + filename = module_name.replace('.', '/') + extension + self._write(filename, src) + if dependencies: + package = module_name if is_package else module_name.rsplit('.', maxsplit=1)[0] + dep_list = find_files_source_depends_on(src, package) + if self.verbose: + def fmt_dep(mod, obj): + return f'{mod}' if obj is None else f'{mod}.{obj}' + dep_str = ''.join(f' {fmt_dep(mod, obj)}\n' for mod, obj in dep_list) + file_info = f'(from file {orig_file_name}) ' if orig_file_name is not None else '' + print(f"{module_name} {file_info}depends on:\n{dep_str}\n") + + for dep_module_name, dep_module_obj in dep_list: + # handle the case where someone did something like `from pack import sub` + # where `sub` is a submodule. In this case we don't have to save pack, just sub. + # this ensures we don't pick up additional dependencies on pack. + # However, in the case where `sub` is not a submodule but an object, then we do have + # to save pack. + if dep_module_obj is not None: + possible_submodule = f'{dep_module_name}.{dep_module_obj}' + if self._module_exists(possible_submodule): + self.require_module_if_not_provided(possible_submodule) + # we don't need to save `pack` + continue + if self._module_exists(dep_module_name): + self.require_module_if_not_provided(dep_module_name) + + def _module_exists(self, module_name: str) -> bool: + try: + self._import_module(module_name) + return True + except ModuleNotFoundError: + return False + + def _get_source_of_module(self, module: types.ModuleType) -> str: + filename = getattr(module, '__file__', None) + result = None if filename is None else linecache.getlines(filename, module.__dict__) + if result is None: + raise ValueError(f'cannot save source for module "{module.__name__}" because ' + f'its source file "{filename}" could not be found.') + return ''.join(result) + + def require_module_if_not_provided(self, module_name: str, dependencies=True): + if self._module_is_already_provided(module_name): + return + self.require_module(module_name, dependencies) + + def require_module(self, module_name: str, dependencies=True): + """This is called by dependencies resolution when it finds that something in the package + depends on the module and it is not already present. It then decides how to provide that module. + The default resolution rules will mark the module as extern if it is part of the standard library, + and call `save_module` otherwise. Clients can subclass this object + and override this method to provide other behavior, such as automatically mocking out a whole class + of modules""" + + root_name = module_name.split('.', maxsplit=1)[0] + if self._can_implicitly_extern(root_name): + if self.verbose: + print(f'implicitly adding {root_name} to external modules ' + f'since it is part of the standard library and is a dependency.') + self.extern_module(root_name) + return + + self.save_module(module_name, dependencies) + + def save_module(self, module_name: str, dependencies=True): + """Save the code for `module_name` into the package. Code for the module is resolved using the `importers` path to find the + module object, and then using its `__file__` attribute to find the source code. + Args: + module_name (str): e.g. `my_package.my_subpackage`, code will be saved to provide code for this package. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). 
+ """ + module = self._import_module(module_name) + source = self._get_source_of_module(module) + self.save_source_string(module_name, source, hasattr(module, '__path__'), dependencies, module.__file__) + + + def _import_module(self, module_name): + last_err = None + for import_module in self.importers: + try: + return import_module(module_name) + except ModuleNotFoundError as err: + last_err = err + if last_err is not None: + raise last_err + else: + raise ModuleNotFoundError(module_name) + + def _create_pickler(self, data_buf): + if self.importers == [importlib.import_module]: + # if we are using the normal import library system, then + # we can use the C implementation of pickle which is faster + return pickle.Pickler(data_buf, protocol=3) + else: + return CustomImportPickler(self._import_module, data_buf, protocol=3) + + def save_pickle(self, package: str, resource: str, obj: Any, dependencies: bool = True): + """Save a python object to the archive using pickle. Equivalent to :func:`torch.save` but saving into + the archive rather than a stand-alone file. Stanard pickle does not save the code, only the objects. + If `dependencies` is true, this method will also scan the pickled objects for which modules are required + to reconstruct them and save the relevant code. + + To be able to save an object where `type(obj).__name__` is `my_module.MyObject`, + `my_module.MyObject` must resolve to the class of the object according to the `importer` order. When saving objects that + have previously been packaged, the importer's `import_module` method will need to be present in the `importer` list + for this to work. + + Args: + package (str): The name of module package this resource should go it (e.g. "my_package.my_subpackage") + resource (str): A unique name for the resource, used to indentify it to load. + obj (Any): The object to save, must be picklable. + dependencies (bool, optional): If True, we scan the source for dependencies (see :ref:`Dependencies`). + """ + filename = self._filename(package, resource) + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = self._create_pickler(data_buf) + pickler.persistent_id = self._persistent_id + pickler.dump(obj) + data_value = data_buf.getvalue() + + if dependencies: + all_dependencies = [] + for opcode, arg, pos in pickletools.genops(data_value): + if opcode.name == 'GLOBAL': # a global reference + assert isinstance(arg, str) + module, field = arg.split(' ') + if module not in all_dependencies: + all_dependencies.append(module) + + if self.verbose: + dep_string = ''.join(f' {dep}\n' for dep in all_dependencies) + print(f"{resource} depends on:\n{dep_string}\n") + + for module_name in all_dependencies: + self.require_module_if_not_provided(module_name) + + self._write(filename, data_value) + + def save_text(self, package: str, resource: str, text: str): + """Save text data to the package + + Args: + package (str): The name of module package this resource should go it (e.g. "my_package.my_subpackage") + resource (str): A unique name for the resource, used to indentify it to load. + text (str): The contents to save + """ + return self.save_binary(package, resource, text.encode('utf-8')) + + def save_binary(self, package, resource, binary: bytes): + """Save raw bytes to the package. + + Args: + package (str): The name of module package this resource should go it (e.g. "my_package.my_subpackage") + resource (str): A unique name for the resource, used to indentify it to load. + binary (str): The data to save. 
+ """ + filename = self._filename(package, resource) + self._write(filename, binary) + + def extern_module(self, module_name: str): + """Include `module` in the list of external modules the package can import. + This will prevent dependency discover from saving + it in the package. The importer will load an external module directly from the standard import system. + Code for extern modules must also exist in the process loading the package. + + Args: + module_name (str): e.g. "my_package.my_subpackage" the name of the external module + """ + if module_name not in self.external: + self.external.append(module_name) + + def extern_modules(self, module_names: List[str]): + """Extern a list of modules. Convience wrapper for calling :meth:`extern_module` on many items. + + Args: + module_names (List[str]): List of module names + """ + for m in module_names: + self.extern_module(m) + + def mock_module(self, module_name: str): + """Replace the code for `module_name` in the package with a fake implementation. This module will return a fake + object for any attribute accessed from it. Because we copy file-by-file, the dependency resolution will sometimes + find files that are imported by model files but whose functionality is never used + (e.g. custom serialization code or training helpers). + Use this function to mock this functionality out without having to modify the original code. + + Args: + module_name (str): e.g. "my_package.my_subpackage" the name of the module to be mocked out. + """ + if '_mock' not in self.provided: + self.save_source_file('_mock', str(Path(__file__).parent / '_mock.py'), dependencies=False) + is_package = hasattr(self._import_module(module_name), '__path__') + self.save_source_string(module_name, _MOCK_IMPL, is_package, dependencies=False) + + + def mock_modules(self, module_names): + """Mock a list of modules. Convience wrapper for calling :meth:`mock_module` on many items. + + Args: + module_names (List[str]): List of module names + """ + for module_name in module_names: + self.mock_module(module_name) + + def _module_is_already_provided(self, qualified_name: str) -> bool: + for mod in self.external: + if qualified_name == mod or qualified_name.startswith(mod + '.'): + return True + return qualified_name in self.provided + + def _persistent_id(self, obj): + # FIXME: the docs say that persistent_id should only return a string + # but torch store returns tuples. This works only in the binary protocol + # see + # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects + # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 + if torch.is_storage(obj): + storage_type = normalize_storage_type(type(obj)) + obj_key = str(obj._cdata) + location = location_tag(obj) + self.serialized_storages[obj_key] = obj + + return ('storage', + storage_type, + obj_key, + location, + obj.size()) + return None + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def _write(self, filename, str_or_bytes): + if isinstance(str_or_bytes, str): + str_or_bytes = str_or_bytes.encode('utf-8') + self.zip_file.write_record(filename, str_or_bytes, len(str_or_bytes)) + + def close(self): + """Write the package to the filesystem. Any calls after close are now invalid. + It is preferable to use resource guard syntax instead: + + with PackageExporter("file.zip") as e: + ... 
+ """ + # Write each tensor to a file named tensor/the_tensor_key in the zip archive + for key in sorted(self.serialized_storages.keys()): + name = 'data/{}'.format(key) + storage = self.serialized_storages[key] + if storage.device.type == 'cpu': + # If it's on the CPU we can directly copy it into the zip file + num_bytes = storage.size() * storage.element_size() + self.zip_file.write_record(name, storage.data_ptr(), num_bytes) + else: + # Copy to a buffer, then serialize that + buf = io.BytesIO() + storage._write_file(buf, _should_read_directly(buf)) + buf_value = buf.getvalue() + self._write(name, buf_value) + contents = ('\n'.join(self.external) + '\n') + self._write('extern_modules', contents) + del self.zip_file + + def _filename(self, package, resource): + package_path = package.replace('.', '/') + resource = _normalize_path(resource) + return f'{package_path}/{resource}' + + def _can_implicitly_extern(self, module_name: str): + return module_name == 'torch' or (module_name not in _DISALLOWED_MODULES + and _is_builtin_or_stdlib_module(self._import_module(module_name))) + + +# even though these are in the standard library, we do not allow them to be +# automatically externed since they offer a lot of system level access +_DISALLOWED_MODULES = ['sys', 'io'] + +def _is_builtin_or_stdlib_module(module: types.ModuleType) -> bool: + if module.__name__ in sys.builtin_module_names: + return True + filename = module.__file__ + if filename is None: + return False + standard_lib = get_python_lib(standard_lib=True) + # this is often a subdirectory of standard_lib so we have to check + # that the file is in the standard_lib directory but not in this one + installed_libs = get_python_lib(standard_lib=False) + in_standard_lib = filename.startswith(standard_lib + '/') + in_installed_libs = filename.startswith(installed_libs + '/') + return in_standard_lib and not in_installed_libs + +_MOCK_IMPL = """\ +from _mock import MockedObject +def __getattr__(attr: str): + return MockedObject(__name__ + '.' 
+ attr) +""" + +def _read_file(filename: str) -> str: + with open(filename, 'rb') as f: + b = f.read() + return b.decode('utf-8') diff --git a/torch/package/find_file_dependencies.py b/torch/package/find_file_dependencies.py new file mode 100644 index 000000000000..25b501e37adc --- /dev/null +++ b/torch/package/find_file_dependencies.py @@ -0,0 +1,42 @@ +from typing import List, Optional, Tuple +import ast +from ._importlib import _resolve_name + +class _ExtractModuleReferences(ast.NodeVisitor): + """ + Extract the list of global variables a block of code will read and write + """ + + @classmethod + def run(cls, src: str, package: str) -> List[Tuple[str, Optional[str]]]: + visitor = cls(package) + tree = ast.parse(src) + visitor.visit(tree) + return list(visitor.references.keys()) + + def __init__(self, package): + super().__init__() + self.package = package + self.references = {} + + def _absmodule(self, module_name: str, level: int) -> str: + if level > 0: + return _resolve_name(module_name, self.package, level) + return module_name + + def visit_Import(self, node): + for alias in node.names: + self.references[(alias.name, None)] = True + + def visit_ImportFrom(self, node): + name = self._absmodule(node.module, 0 if node.level is None else node.level) + for alias in node.names: + # from my_package import foo + # foo may be a module, so we have to add it to the list of + # potential references, if import of it fails, we will ignore it + if alias.name != '*': + self.references[(name, alias.name)] = True + else: + self.references[(name, None)] = True + +find_files_source_depends_on = _ExtractModuleReferences.run diff --git a/torch/package/importer.py b/torch/package/importer.py new file mode 100644 index 000000000000..59c7cd9d0312 --- /dev/null +++ b/torch/package/importer.py @@ -0,0 +1,388 @@ +from typing import List, Callable, Dict, Optional, Any, Union +import builtins +import importlib +from torch.serialization import _load +import pickle +import torch +import _compat_pickle # type: ignore +import types +import os.path + +from ._importlib import _normalize_line_endings, _resolve_name, _sanity_check, _calc___package__, \ + _normalize_path +from ._mock_zipreader import MockZipReader + +class PackageImporter: + """Importers allow you to load code written to packages by PackageExporter. + Code is loaded in a hermetic way, using files from the package + rather than the normal python import system. This allows + for the packaging of PyTorch model code and data so that it can be run + on a server or used in the future for transfer learning. + + The importer for packages ensures that code in the module can only be loaded from + within the package, except for modules explicitly listed as external during export. + The file `extern_modules` in the zip archive lists all the modules that a package externally depends on. + This prevents "implicit" dependencies where the package runs locally because it is importing + a locally-installed package, but then fails when the package is copied to another machine. + """ + + modules : Dict[str, Optional[types.ModuleType]] + """The dictionary of already loaded modules from this package, equivalent to `sys.modules` but + local to this importer. + """ + + def __init__(self, filename: str, module_allowed: Callable[[str], bool] = lambda module_name: True): + """Open `filename` for importing. This checks that the imported package only requires modules + allowed by `module_allowed` + + Args: + filename (str): archive to load. 
Can also be a directory of the unzipped files in the archive + for easy debugging and editing. + module_allowed (Callable[[str], bool], optional): A method to determine if a externally provided module + should be allowed. Can be used to ensure packages loaded do not depend on modules that the server + does not support. Defaults to allowing anything. + + Raises: + ImportError: If the package will use a disallowed module. + """ + self.filename = filename + self.zip_reader : Any + if not os.path.isdir(self.filename): + self.zip_reader = torch._C.PyTorchFileReader(self.filename) + else: + self.zip_reader = MockZipReader(self.filename) + + self.root = _PackageNode(None) + self.modules = {} + self.extern_modules = self._read_extern() + + for extern_module in self.extern_modules: + if not module_allowed(extern_module): + raise ImportError(f"package '{filename}' needs the external module '{extern_module}' " + f"but that module has been disallowed") + self._add_extern(extern_module) + + for filename in self.zip_reader.get_all_records(): + self._add_file(filename) + + self.patched_builtins = builtins.__dict__.copy() + self.patched_builtins['__import__'] = self.__import__ + # allow pickles from archive using `import resources` + self.modules['resources'] = self # type: ignore + + # used for torch.serialization._load + self.Unpickler = lambda *args, **kwargs: _UnpicklerWrapper(self, *args, **kwargs) + + def import_module(self, name: str, package=None): + """Load a module from the package if it hasn't already been loaded, and then return + the module. Modules are loaded locally + to the importer and will appear in `self.modules` rather than `sys.modules` + + Args: + name (str): Fully qualified name of the module to load. + package ([type], optional): Unused, but present to match the signature of importlib.import_module. Defaults to None. + + Returns: + types.ModuleType: the (possibly already) loaded module. + """ + return self._gcd_import(name) + + def load_binary(self, package: str, resource: str) -> bytes: + """Load raw bytes. + + Args: + package (str): The name of module package (e.g. "my_package.my_subpackage") + resource (str): The unique name for the resource. + + Returns: + bytes: The loaded data. + """ + + path = self._zipfile_path(package, resource) + return self.zip_reader.get_record(path) + + def load_text(self, package: str, resource: str, encoding: str = 'utf-8', errors: str = 'strict') -> str: + """Load a string. + + Args: + package (str): The name of module package (e.g. "my_package.my_subpackage") + resource (str): The unique name for the resource. + encoding (str, optional): Passed to `decode`. Defaults to 'utf-8'. + errors (str, optional): Passed to `decode`. Defaults to 'strict'. + + Returns: + str: The loaded text. + """ + data = self.load_binary(package, resource) + return data.decode(encoding, errors) + + def load_pickle(self, package: str, resource: str, map_location=None) -> Any: + """Unpickles the resource from the package, loading any modules that are needed to construct the objects + using :meth:`import_module` + + Args: + package (str): The name of module package (e.g. "my_package.my_subpackage") + resource (str): The unique name for the resource. + map_location: Passed to `torch.load` to determine how tensors are mapped to devices. Defaults to None. + + Returns: + Any: the unpickled object. 
+ """ + pickle_file = self._zipfile_path(package, resource) + return _load(self.zip_reader, map_location, self, pickle_file=pickle_file) + + + def _read_extern(self): + return self.zip_reader.get_record('extern_modules').decode('utf-8').splitlines(keepends=False) + + def _make_module(self, name: str, filename: Optional[str], is_package: bool): + spec = importlib.machinery.ModuleSpec(name, self, is_package=is_package) # type: ignore + module = importlib.util.module_from_spec(spec) + self.modules[name] = module + ns = module.__dict__ + ns['__spec__'] = spec + ns['__loader__'] = self + ns['__file__'] = filename + ns['__cached__'] = None + ns['__builtins__'] = self.patched_builtins + if filename is not None: + code = self._compile_source(filename) + exec(code, ns) + return module + + def _load_module(self, name: str): + cur : _PathNode = self.root + for atom in name.split('.'): + if not isinstance(cur, _PackageNode) or atom not in cur.children: + raise ModuleNotFoundError( + f'No module named "{name}" in self-contained archive "{self.filename}"' + f' and the module is also not in the list of allowed external modules: {self.extern_modules}') + cur = cur.children[atom] + if isinstance(cur, _ExternNode): + module = self.modules[name] = importlib.import_module(name) + return module + return self._make_module(name, cur.source_file, isinstance(cur, _PackageNode)) # type: ignore + + def _compile_source(self, fullpath): + source = self.zip_reader.get_record(fullpath) + source = _normalize_line_endings(source) + return compile(source, fullpath, 'exec', dont_inherit=True) + + # note: named `get_source` so that linecache can find the source + # when this is the __loader__ of a module. + def get_source(self, module_name) -> str: + module = self.import_module(module_name) + return self.zip_reader.get_record(module.__file__).decode('utf-8') + + # note: copied from cpython's import code, with call to create module replaced with _make_module + def _do_find_and_load(self, name): + path = None + parent = name.rpartition('.')[0] + if parent: + if parent not in self.modules: + self._gcd_import(parent) + # Crazy side-effects! + if name in self.modules: + return self.modules[name] + parent_module = self.modules[parent] + try: + path = parent_module.__path__ # type: ignore + except AttributeError: + msg = (_ERR_MSG + '; {!r} is not a package').format(name, parent) + raise ModuleNotFoundError(msg, name=name) from None + + module = self._load_module(name) + + if parent: + # Set the module as an attribute on its parent. + parent_module = self.modules[parent] + if parent_module.__loader__ is self: # type: ignore + setattr(parent_module, name.rpartition('.')[2], module) + return module + + # note: copied from cpython's import code + def _find_and_load(self, name): + module = self.modules.get(name, _NEEDS_LOADING) + if module is _NEEDS_LOADING: + return self._do_find_and_load(name) + + if module is None: + message = ('import of {} halted; ' + 'None in sys.modules'.format(name)) + raise ModuleNotFoundError(message, name=name) + + return module + + + def _gcd_import(self, name, package=None, level=0): + """Import and return the module based on its name, the package the call is + being made from, and the level adjustment. + + This function represents the greatest common denominator of functionality + between import_module and __import__. This includes setting __package__ if + the loader did not. 
+ + """ + _sanity_check(name, package, level) + if level > 0: + name = _resolve_name(name, package, level) + + return self._find_and_load(name) + + # note: copied from cpython's import code + def _handle_fromlist(self, module, fromlist, *, recursive=False): + """Figure out what __import__ should return. + + The import_ parameter is a callable which takes the name of module to + import. It is required to decouple the function from assuming importlib's + import implementation is desired. + + """ + # The hell that is fromlist ... + # If a package was imported, try to import stuff from fromlist. + if hasattr(module, '__path__'): + for x in fromlist: + if not isinstance(x, str): + if recursive: + where = module.__name__ + '.__all__' + else: + where = "``from list''" + raise TypeError(f"Item in {where} must be str, " + f"not {type(x).__name__}") + elif x == '*': + if not recursive and hasattr(module, '__all__'): + self._handle_fromlist(module, module.__all__, + recursive=True) + elif not hasattr(module, x): + from_name = '{}.{}'.format(module.__name__, x) + try: + self._gcd_import(from_name) + except ModuleNotFoundError as exc: + # Backwards-compatibility dictates we ignore failed + # imports triggered by fromlist for modules that don't + # exist. + if (exc.name == from_name and + self.modules.get(from_name, _NEEDS_LOADING) is not None): + continue + raise + return module + + def __import__(self, name, globals=None, locals=None, fromlist=(), level=0): + if level == 0: + module = self._gcd_import(name) + else: + globals_ = globals if globals is not None else {} + package = _calc___package__(globals_) + module = self._gcd_import(name, package, level) + if not fromlist: + # Return up to the first dot in 'name'. This is complicated by the fact + # that 'name' may be relative. + if level == 0: + return self._gcd_import(name.partition('.')[0]) + elif not name: + return module + else: + # Figure out where to slice the module's name up to the first dot + # in 'name'. + cut_off = len(name) - len(name.partition('.')[0]) + # Slice end needs to be positive to alleviate need to special-case + # when ``'.' not in name``. + return self.modules[module.__name__[:len(module.__name__) - cut_off]] + else: + return self._handle_fromlist(module, fromlist) + + def _get_package(self, package): + """Take a package name or module object and return the module. + + If a name, the module is imported. If the passed or imported module + object is not a package, raise an exception. + """ + if hasattr(package, '__spec__'): + if package.__spec__.submodule_search_locations is None: + raise TypeError('{!r} is not a package'.format( + package.__spec__.name)) + else: + return package + else: + module = self.import_module(package) + if module.__spec__.submodule_search_locations is None: + raise TypeError('{!r} is not a package'.format(package)) + else: + return module + + def _zipfile_path(self, package, resource): + package = self._get_package(package) + resource = _normalize_path(resource) + assert package.__loader__ is self + return f"{package.__name__.replace('.', '/')}/{resource}" + + def _get_or_create_package(self, atoms: List[str]) -> 'Union[_PackageNode, _ExternNode]': + cur = self.root + for i, atom in enumerate(atoms): + node = cur.children.get(atom, None) + if node is None: + node = cur.children[atom] = _PackageNode(None) + if isinstance(node, _ExternNode): + return node + if isinstance(node, _ModuleNode): + name = ".".join(atoms[:i]) + raise ImportError(f'inconsistent module structure. 
module {name} is not a package, but has submodules') + assert isinstance(node, _PackageNode) + cur = node + return cur + + def _add_file(self, filename: str): + *prefix, last = filename.split('/') + package = self._get_or_create_package(prefix) + if isinstance(package, _ExternNode): + raise ImportError(f'inconsistent module structure. package contains a module file {filename}' + f' that is a subpackage of a module marked external.') + if last == '__init__.py': + package.source_file = filename + elif last.endswith('.py'): + package_name = last.rstrip('.py') + package.children[package_name] = _ModuleNode(filename) + + def _add_extern(self, extern_name: str): + *prefix, last = extern_name.split('.') + package = self._get_or_create_package(prefix) + if isinstance(package, _ExternNode): + return # the shorter extern covers this extern case + package.children[last] = _ExternNode() + + +_NEEDS_LOADING = object() +_ERR_MSG_PREFIX = 'No module named ' +_ERR_MSG = _ERR_MSG_PREFIX + '{!r}' + +class _UnpicklerWrapper(pickle._Unpickler): # type: ignore + def __init__(self, importer, *args, **kwargs): + super().__init__(*args, **kwargs) + self._importer = importer + + def find_class(self, module, name): + # Subclasses may override this. + if self.proto < 3 and self.fix_imports: + if (module, name) in _compat_pickle.NAME_MAPPING: + module, name = _compat_pickle.NAME_MAPPING[(module, name)] + elif module in _compat_pickle.IMPORT_MAPPING: + module = _compat_pickle.IMPORT_MAPPING[module] + mod = self._importer.import_module(module) + return getattr(mod, name) + +class _PathNode: + pass + +class _PackageNode(_PathNode): + def __init__(self, source_file: Optional[str]): + self.source_file = source_file + self.children : Dict[str, _PathNode] = {} + +class _ModuleNode(_PathNode): + __slots__ = ['source_file'] + + def __init__(self, source_file: str): + self.source_file = source_file + +class _ExternNode(_PathNode): + pass diff --git a/torch/serialization.py b/torch/serialization.py index c68c1ff0b60d..1c05767922a8 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -821,7 +821,7 @@ def restore_location(storage, location): return restore_location -def _load(zip_file, map_location, pickle_module, **pickle_load_args): +def _load(zip_file, map_location, pickle_module, pickle_file='data.pkl', **pickle_load_args): restore_location = _get_restore_location(map_location) loaded_storages = {} @@ -847,7 +847,7 @@ def persistent_load(saved_id): return storage # Load the data (which may in turn use `persistent_load` to load tensors) - data_file = io.BytesIO(zip_file.get_record('data.pkl')) + data_file = io.BytesIO(zip_file.get_record(pickle_file)) unpickler = pickle_module.Unpickler(data_file, **pickle_load_args) unpickler.persistent_load = persistent_load result = unpickler.load() From 25ed739ac90cb5fa82963131411cc783de0bd8fd Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Tue, 22 Sep 2020 21:15:15 -0700 Subject: [PATCH 042/449] [packaging] rstrip fix (#45166) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45166 Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D23852505 Pulled By: zdevito fbshipit-source-id: 6bb743b37333ae19fc24629686e8d06aef812c50 --- torch/package/importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/package/importer.py b/torch/package/importer.py index 59c7cd9d0312..1a02e69436fa 100644 --- a/torch/package/importer.py +++ b/torch/package/importer.py @@ -340,7 +340,7 @@ def _add_file(self, filename: str): if 
last == '__init__.py': package.source_file = filename elif last.endswith('.py'): - package_name = last.rstrip('.py') + package_name = last[:-len('.py')] package.children[package_name] = _ModuleNode(filename) def _add_extern(self, extern_name: str): From 0a9ac98bed5d3b14566f19e584071764d570cb8c Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Tue, 22 Sep 2020 21:40:37 -0700 Subject: [PATCH 043/449] [reland][pytorch] refine dispatch keys in native_functions.yaml (1/N) (#45137) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45137 Reland https://github.com/pytorch/pytorch/pull/45010 - which broke master due to merge conflict. Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D23843510 Pulled By: ljk53 fbshipit-source-id: 28aabb9da533b6b806ab8779a0ee96b695e9e242 --- aten/src/ATen/native/native_functions.yaml | 140 ++++++++++++++++++++- 1 file changed, 135 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index d5a746e2a522..c61f021f8c5f 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -299,6 +299,8 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -313,6 +315,8 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -489,6 +493,8 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -510,6 +516,8 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -531,6 +539,8 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -591,6 +601,8 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -682,6 +694,8 @@ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -926,12 +940,16 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -1005,6 +1023,8 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1015,6 +1035,8 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1198,7 +1220,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1464,6 +1486,8 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1474,6 +1498,8 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1484,6 +1510,8 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1494,6 +1522,8 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1608,6 +1638,8 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1626,6 +1658,8 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1636,6 +1670,8 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2004,12 +2040,16 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2724,6 +2764,8 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2910,6 +2952,8 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2931,6 +2975,8 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2945,6 +2991,8 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2967,6 +3015,8 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3177,6 +3227,8 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3246,6 +3298,8 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3259,6 +3313,8 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3606,8 +3662,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -4809,6 +4865,8 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4827,6 +4885,8 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4916,27 +4976,41 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? 
generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -5390,6 +5464,8 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5400,6 +5476,8 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5619,12 +5697,16 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5657,6 +5739,8 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5684,6 +5768,8 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -5750,19 +5836,27 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -6487,10 +6581,14 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6500,6 +6598,8 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6533,6 +6633,8 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6544,6 +6646,8 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6572,6 +6676,8 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6582,14 +6688,20 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6613,6 +6725,8 @@ - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6678,10 +6792,14 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6691,13 +6809,19 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6707,6 +6831,8 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) python_module: nn @@ -7478,6 +7604,8 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7487,6 +7615,8 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? 
# From 989d877c95a9107fabcee1bda9a6cfacb8098d94 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 22 Sep 2020 21:41:13 -0700 Subject: [PATCH 044/449] [JIT] Do not allow creating generics with None types (#44958) Summary: Otherwise, invoking something like `python -c "import torch._C;print(torch._C.ListType(None))"` will result in SIGSEGV Discovered while trying to create a torch script for function with the following type annotation `Tuple[int, Ellipsis] -> None` Pull Request resolved: https://github.com/pytorch/pytorch/pull/44958 Reviewed By: suo Differential Revision: D23799906 Pulled By: malfet fbshipit-source-id: 916a243007d13ed3e7a5b282dd712da3d66e3bf7 --- aten/src/ATen/core/jit_type.h | 7 ++++++- aten/src/ATen/core/type.cpp | 3 +++ test/jit/test_list_dict.py | 5 +++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 74eaa7012ac1..1c9d31dd630c 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -263,7 +263,12 @@ struct SingleElementType : public Type { } protected: - SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) {} + SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) { + if (!this->elem) { + throw std::runtime_error(c10::str( + "Can not create ", typeKindToString(Kind), " with None type")); + } + } private: TypePtr elem; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 475c59759f78..13e82d434647 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -716,6 +716,9 @@ TupleType::TupleType( schema_(std::move(schema)) { has_free_variables_ = std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) { + if (!v) { + throw std::runtime_error("Can not create tuple with None type"); + } return v->hasFreeVariables(); }); if (schema_) { diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index a1c378963918..8d0f74349b3b 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -1155,6 +1155,11 @@ def annotated_fn(x: torch.Tensor) -> List: with self.assertRaisesRegex(RuntimeError, r"Attempted to use List without a contained type"): torch.jit.script(annotated_fn) + def test_list_none(self): + with self.assertRaisesRegex(RuntimeError, "Can not create ListType with None type"): + x = torch._C.ListType(None) + + class TestDict(JitTestCase): def dict(self): From 144dacd8d9aee815692524052ea72a5ceb561fe3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 22 Sep 2020 22:41:51 -0700 Subject: [PATCH 045/449] CUDA BFloat16 batched gemm (#45167) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45167 Reviewed By: mruberry Differential Revision: D23860458 Pulled By: ngimel fbshipit-source-id: 698de424a046963a30017b58d227fa510f85bf3f --- aten/src/THC/THCBlas.cu | 65 ++++++++++++----------- aten/src/THC/THCBlas.h | 8 +-- aten/src/THC/generic/THCTensorMathBlas.cu | 6 +-- test/test_torch.py | 15 +++--- 4 files changed, 46 insertions(+), 48 deletions(-) diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index fe906ce66fa3..73d411f05ef1 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -107,30 +107,9 @@ void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, i } -// Check https://github.com/pytorch/pytorch/issues/22078 -// for information about the bug. We don't know the exact conditions that trigger it, -// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a -// necessary condition. 
-static void checkCuda90Bug(int i_m, int i_n, int i_k) -{ -#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000 - static std::once_flag alreadyWarned; - const int LIMIT = 1 << 21; - if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) { - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 5 || prop->major == 6) { - std::call_once(alreadyWarned, []() { - TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later."); - }); - } - } -#endif -} - /* Level 3 */ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } @@ -141,11 +120,10 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc) { at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -157,7 +135,6 @@ void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int6 at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#if CUDA_VERSION >= 9010 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -205,7 +182,6 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i #endif // CUDA_VERSION < 11000 #endif // __HIP_PLATFORM_HCC__ } -#endif // CUDA_VERSION or __HIP_PLATFORM_HCC__ #ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, @@ -236,6 +212,40 @@ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, i } #endif // __HIP_PLATFORM_HCC__ +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, + at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, + at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount) +{ + at::globalContext().alertCuBLASConfigNotDeterministic(); + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + + { + THError("Cublas_SgemmStridedBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); 
+ } + + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 8) { + TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU"); + } + + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + float fAlpha = alpha; + float fBeta = beta; + THCublasCheck(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&fAlpha, a, CUDA_R_16BF, (int)lda, strideA, + b, CUDA_R_16BF, (int)ldb, strideB, + (void*)&fBeta, c, CUDA_R_16BF, (int)ldc, strideC, + (int)batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, float beta, float *c[], int64_t ldc, int64_t batchCount) @@ -270,7 +280,6 @@ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -294,7 +303,6 @@ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, @@ -330,7 +338,6 @@ void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -353,5 +360,3 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif - diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index cff3180a974a..a9b646a4374f 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -14,7 +14,7 @@ THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 THC_API void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, 
int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc); #endif @@ -24,22 +24,18 @@ THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, double beta, double *c[], int64_t ldc, int64_t batchCount); -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ THC_API void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount); THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#if CUDA_VERSION >= 9010 || defined(__HIP_PLATFORM_HCC__) void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 3158e0e267ed..a5d159a9cace 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -281,7 +281,7 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, #endif //CUDA_VERSION #elif defined(THC_REAL_IS_BFLOAT16) -#if defined(__HIP_PLATFORM_HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 THCudaBlas_BgemmStridedBatched( state, transpose_batch1, @@ -310,15 +310,13 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, THCTensor_(freeCopyTo)(state, result_, result); } -#if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) +#if defined(THC_REAL_IS_BFLOAT16) && !(defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000) // To avoid "variable was set but never used" warning [&transpose_batch1, &transpose_batch2, &lda, &ldb, &ldc]{}(); TORCH_CHECK(false, "BgemmStridedBatched is not supported with at::BFloat16 type"); #endif } -#if !defined(THC_REAL_IS_BFLOAT16) || defined(__HIP_PLATFORM_HCC__) at::namedinference::propagate_names_if_nonempty(result, maybe_outnames); -#endif #else ERROR_ONLY_FP_TYPES("baddbmm"); diff --git a/test/test_torch.py b/test/test_torch.py index 440bf30286bb..6b529712ab5c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -19626,8 +19626,6 @@ def test_movedim_view(self, device): # with _float_types when bfloat16 bringup is complete on all platforms _float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types 
-_complex_and_float_types2 = _float_types2 + _complex_types - _signed_types = [ torch.half, torch.float, torch.double, torch.int8, torch.short, torch.int, torch.long @@ -19798,20 +19796,21 @@ def inner(self, device, dtype): ('pow', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d).abs()], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes()), ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, [tf32_on_and_off(0.005)]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, + _cpu_types, True, [tf32_on_and_off(0.005)]), ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2), + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)), ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? is deprecated")]), ('baddbmm', 'two_scalars', _small_3d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? 
is deprecated")]), ('bmm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), From 7fba30c2be4c1373c1e4424111e5ec2b878a85da Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Tue, 22 Sep 2020 22:44:41 -0700 Subject: [PATCH 046/449] [quant][fx][bug] Fix error in convert step for QAT (#45050) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45050 Update tests to actually test for QAT Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_linear Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23808022 fbshipit-source-id: d749ab2d215fe19238ff9d539307ffce9ef0ca9b --- torch/quantization/fx/quantize.py | 7 ++++++- torch/testing/_internal/common_quantization.py | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 8d742255838a..6254120999f0 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -132,6 +132,10 @@ def is_activation_post_process(module): return (isinstance(module, torch.quantization.ObserverBase) or isinstance(module, torch.quantization.FakeQuantize)) +def is_submodule_of_fake_quant(name, module, named_modules): + parent_name, _ = _parent_name(name) + return is_activation_post_process(named_modules[parent_name]) + # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { torch.nn.functional.conv2d : [1], @@ -529,9 +533,10 @@ def load_arg(a): env[node.name] = act_post_process_removed_graph.node_copy(node, load_arg) act_post_process_removed_graph.output(map_arg(self.quantized_graph.result, load_arg)) + module_dict = dict(model.named_modules()) to_be_removed = [] for name, module in model.named_modules(): - if is_activation_post_process(module): + if is_activation_post_process(module) and not is_submodule_of_fake_quant(name, module, module_dict): to_be_removed.append(name) for n in to_be_removed: delattr(model, n) diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 8339335bd04b..3edbd5dd7fcd 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -11,7 +11,8 @@ from torch.testing._internal.common_utils import TestCase from torch.quantization import QuantWrapper, QuantStub, DeQuantStub, \ default_qconfig, default_dynamic_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \ - propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig + propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig, \ + get_default_qat_qconfig from torch.quantization.quantization_mappings import ( get_dynamic_quant_module_mappings, get_qconfig_propagation_list, @@ -614,12 +615,13 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, if type(inputs) == list: inputs = inputs[0] if quant_type == QuantType.QAT: + qconfig_dict = {'': get_default_qat_qconfig(torch.backends.quantized.engine)} model.train() else: + qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model.eval() original = symbolic_trace(model) - qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} if quant_type == QuantType.DYNAMIC: prepare = prepare_dynamic_fx convert = convert_dynamic_fx From 215679573ebff5a03238a7f9aa801a6c00826f19 Mon Sep 17 00:00:00 2001 From: Alex Suhan 
Date: Tue, 22 Sep 2020 23:46:32 -0700 Subject: [PATCH 047/449] [TensorExpr] Fix operator order in combineMultilane (#45157) Summary: combineMultilane used the wrong order when ramp was on the left hand side, which matters for subtract. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45157 Test Plan: test_tensorexpr --gtest_filter=TensorExprTest.SimplifyRampSubBroadcast Reviewed By: ailzhang Differential Revision: D23851751 Pulled By: asuhan fbshipit-source-id: 864d1611e88769fb43327ef226bb3310017bf858 --- test/cpp/tensorexpr/test_simplify.cpp | 14 ++++++++++++++ test/cpp/tensorexpr/tests.h | 1 + torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index b88aa17efd3e..f8c5cdd3546d 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3950,5 +3950,19 @@ void testSimplifySyncThreads() { } } +void testSimplifyRampSubBroadcast() { + KernelScope kernel_scope; + int num_lanes = 4; + ExprHandle ramp = Ramp::make(ExprHandle(0), ExprHandle(6), num_lanes); + ExprHandle broadcast = Broadcast::make(ExprHandle(-5), num_lanes); + ExprHandle simplified = IRSimplifier::simplify(ramp - broadcast); + Ramp* newRamp = simplified.AsNode(); + IS_NODE_WITH_NAME(IntImm, newRamp->base(), base); + ASSERT_EQ(base->value(), 5); + IS_NODE_WITH_NAME(IntImm, newRamp->stride(), stride); + ASSERT_EQ(stride->value(), 6); + ASSERT_EQ(newRamp->lanes(), num_lanes); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index c38a368af13c..56831c8db663 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -216,6 +216,7 @@ namespace jit { _(SimplifyReorderForCond) \ _(SimplifyFuseConditions) \ _(SimplifySyncThreads) \ + _(SimplifyRampSubBroadcast) \ _(RegisterizerSimple) \ _(RegisterizerLoop) \ _(RegisterizerLoopFixedLoad) \ diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index 3429239a4491..f6852b627969 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -123,7 +123,7 @@ const Expr* combineMultilane(const Expr* lhs, const Expr* rhs) { throw malformed_input("multilane lane mismatch"); } const Expr* ret = new Ramp( - new Op(bc->value(), ramp->base()), ramp->stride(), ramp->lanes()); + new Op(ramp->base(), bc->value()), ramp->stride(), ramp->lanes()); return ret; } } From 76dc50e9c8698da338334ecdc80bb00e60186849 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Wed, 23 Sep 2020 00:44:20 -0700 Subject: [PATCH 048/449] [RPC] Infer backend type if only options are given (#45065) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45065 To preserve backwards compatibility with applications that were passing in some ProcessGroupRpcBackendOptions but were not explicitly setting backend=BackendType.PROCESS_GROUP, we're here now inferring the backend type from the options if only the latter ones are passed. If neither are passed, we'll default to TensorPipe, as before this change. ghstack-source-id: 112586258 Test Plan: Added new unit tests. 
Reviewed By: pritamdamania87 Differential Revision: D23814289 fbshipit-source-id: f4be7919e0817a4f539a50ab12216dc3178cb752 --- torch/csrc/distributed/rpc/init.cpp | 5 + torch/distributed/rpc/__init__.py | 59 +++++++- torch/distributed/rpc/backend_registry.py | 13 ++ .../_internal/distributed/rpc/rpc_test.py | 129 ++++++++++++++++++ 4 files changed, 203 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index f85adb88dc09..ea1db04225c7 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -50,6 +50,11 @@ PyObject* rpc_init(PyObject* /* unused */) { :meth:`~torch.distributed.rpc.init_rpc` in order to initialize RPC with specific configurations, such as the RPC timeout and ``init_method`` to be used. )") + .def(py::init<>()) + .def( + py::init(), + py::arg("rpc_timeout") = kDefaultRpcTimeoutSeconds, + py::arg("init_method") = kDefaultInitMethod) .def_readwrite( "rpc_timeout", &RpcBackendOptions::rpcTimeoutSeconds, diff --git a/torch/distributed/rpc/__init__.py b/torch/distributed/rpc/__init__.py index 4f1180bf954f..4598c78e72fe 100644 --- a/torch/distributed/rpc/__init__.py +++ b/torch/distributed/rpc/__init__.py @@ -1,7 +1,12 @@ +import logging +import threading import torch import torch.distributed as dist -import threading + + +logger = logging.getLogger(__name__) + _init_counter = 0 _init_counter_lock = threading.Lock() @@ -36,7 +41,7 @@ def is_available(): def init_rpc( name, - backend=BackendType.TENSORPIPE, + backend=None, rank=-1, world_size=None, rpc_backend_options=None, @@ -71,7 +76,55 @@ def init_rpc( are available. """ - if not rpc_backend_options: + if backend is not None and not isinstance(backend, backend_registry.BackendType): + raise TypeError( + "Argument backend must be a member of BackendType" + ) + + if rpc_backend_options is not None and not isinstance(rpc_backend_options, RpcBackendOptions): + raise TypeError( + "Argument rpc_backend_options must be an instance of RpcBackendOptions" + ) + + # To avoid breaking users that passed a ProcessGroupRpcBackendOptions + # without specifying the backend as PROCESS_GROUP when that was the + # default, we try to detect the backend from the options when only the + # latter is passed. + if backend is None and rpc_backend_options is not None: + for candidate_backend in BackendType: + if isinstance( + rpc_backend_options, + type( + backend_registry.construct_rpc_backend_options( + candidate_backend + ) + ), + ): + backend = candidate_backend + break + else: + raise TypeError( + f"Could not infer backend for options {rpc_backend_options}" + ) + if backend != BackendType.TENSORPIPE: + logger.warning( + f"RPC was initialized with no explicit backend but with options " + f"corresponding to {backend}, hence that backend will be used " + f"instead of the default {BackendType.TENSORPIPE}. To silence this " + f"warning pass `backend={backend}` explicitly." + ) + + if backend is None: + backend = BackendType.TENSORPIPE + + if backend == BackendType.PROCESS_GROUP: + logger.warning( + "RPC was initialized with the PROCESS_GROUP backend which is " + "deprecated and slated to be removed and superseded by the TENSORPIPE " + "backend. It is recommended to migrate to the TENSORPIPE backend." + ) + + if rpc_backend_options is None: # default construct a set of RPC backend options. 
rpc_backend_options = backend_registry.construct_rpc_backend_options( backend diff --git a/torch/distributed/rpc/backend_registry.py b/torch/distributed/rpc/backend_registry.py index 8ca185ab1ff1..6dac7cb0863a 100644 --- a/torch/distributed/rpc/backend_registry.py +++ b/torch/distributed/rpc/backend_registry.py @@ -134,8 +134,21 @@ def _init_process_group(store, rank, world_size): def _process_group_init_backend_handler( store, name, rank, world_size, rpc_backend_options ): + from . import ProcessGroupRpcBackendOptions from . import ProcessGroupAgent + if not isinstance(store, dist.Store): + raise TypeError("`store` must be a c10d::Store. {}".format(store)) + + if not isinstance( + rpc_backend_options, ProcessGroupRpcBackendOptions + ): + raise TypeError( + "`rpc_backend_options` must be a `ProcessGroupRpcBackendOptions`. {}".format( + rpc_backend_options + ) + ) + group = _init_process_group(store, rank, world_size) # TODO: add try-except and destroy _agent in all processes if any fails. diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index f469dd32ea04..e343ffc1939b 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1,6 +1,7 @@ import concurrent.futures import contextlib import json +import logging import sys from threading import Lock import time @@ -459,6 +460,14 @@ def return_future(): return torch.futures.Future() +class FooBackendOptions(rpc.RpcBackendOptions): + def __init__(self, init_method): + # Must call the __init__ of the superclass (and do so directly, + # without using super()) because... pybind. + rpc.RpcBackendOptions.__init__(self) + self.init_method = init_method + + # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -3298,9 +3307,98 @@ def test_init_rpc_twice(self): rpc.shutdown() + def test_wrong_types(self): + with self.assertRaisesRegex( + TypeError, + "Argument backend must be a member of BackendType", + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend="TENSORPIPE", + ) + + with self.assertRaisesRegex( + TypeError, + "Argument rpc_backend_options must be an instance of RpcBackendOptions", + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=self.rpc_backend, + rpc_backend_options={"init_method": self.init_method} + ) + + def test_cannot_infer_backend_from_options(self): + # An exception should be raised if the backend isn't specified but + # options are given which are not an instance of any of the known + # agents' option classes. + rpc_backend_options = FooBackendOptions(self.init_method) + + with self.assertRaisesRegex(TypeError, "Could not infer backend for options"): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + # Do _not_ pass backend. + rpc_backend_options=rpc_backend_options, + ) + class ProcessGroupAgentRpcTest(RpcAgentTestFixture): + def test_mismatched_type_for_options(self): + # An exception should be raised if the options are not an instance of + # ProcessGroupRpcBackendOptions. 
+ rpc_backend_options = FooBackendOptions(self.init_method) + + with self.assertRaisesRegex( + TypeError, "`rpc_backend_options` must be a `ProcessGroupRpcBackendOptions`" + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=rpc.BackendType.PROCESS_GROUP, + rpc_backend_options=rpc_backend_options, + ) + + def test_infer_backend_from_options(self): + rpc_backend_options = rpc.ProcessGroupRpcBackendOptions( + init_method=self.init_method + ) + + with self.assertLogs("torch.distributed.rpc", logging.WARNING) as cm: + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + # Do _not_ pass backend. + rpc_backend_options=rpc_backend_options, + ) + self.assertIn( + "To silence this warning pass `backend=BackendType.PROCESS_GROUP` explicitly.", + "\n".join(cm.output), + ) + + self.assertIsInstance(rpc.api._get_current_rpc_agent(), rpc.ProcessGroupAgent) + + def test_logs_deprecation_warning(self): + with self.assertLogs("torch.distributed.rpc", logging.WARNING) as cm: + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=rpc.BackendType.PROCESS_GROUP, + rpc_backend_options=self.rpc_backend_options, + ) + self.assertIn( + "It is recommended to migrate to the TENSORPIPE backend.", + "\n".join(cm.output), + ) + @skip_if_lt_x_gpu(2) @dist_init def test_cuda(self): @@ -3895,6 +3993,37 @@ def test_rpc_script_timeout(self): class TensorPipeAgentRpcTest(RpcAgentTestFixture): + def test_mismatched_type_for_options(self): + # An exception should be raised if the options are not an instance of + # TensorPipeRpcBackendOptions. + rpc_backend_options = FooBackendOptions(self.init_method) + + with self.assertRaisesRegex( + TypeError, "`rpc_backend_options` must be a `TensorPipeRpcBackendOptions`" + ): + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + backend=rpc.BackendType.TENSORPIPE, + rpc_backend_options=rpc_backend_options, + ) + + def test_infer_backend_from_options(self): + rpc_backend_options = rpc.TensorPipeRpcBackendOptions( + init_method=self.init_method + ) + + rpc.init_rpc( + name=worker_name(self.rank), + rank=self.rank, + world_size=self.world_size, + # Do _not_ pass backend. + rpc_backend_options=rpc_backend_options, + ) + + self.assertIsInstance(rpc.api._get_current_rpc_agent(), rpc.TensorPipeAgent) + # FIXME Merge this test with the corresponding one in RpcTest. 
@dist_init(setup_rpc=False) def test_set_and_get_num_worker_threads(self): From e5bade7b2cccf5cccfc812778059578c15c3e8ab Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 23 Sep 2020 07:40:51 -0700 Subject: [PATCH 049/449] [PyTorch Mobile] Move string op registrations to prim and make them selective (#44960) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44960 Since we have templated selective build, it should be safe to move the operators to prim so that they can be selectively built in mobile Test Plan: CI Reviewed By: linbinyu Differential Revision: D23772025 fbshipit-source-id: 52cebae76e4df5a6b2b51f2cd82f06f75e2e45d0 --- aten/src/ATen/templates/TypeDefault.cpp | 63 +- tools/build_variables.bzl | 1 - torch/csrc/jit/runtime/register_prim_ops.cpp | 621 ++++++++++++++++-- .../csrc/jit/runtime/register_string_ops.cpp | 499 -------------- 4 files changed, 599 insertions(+), 585 deletions(-) delete mode 100644 torch/csrc/jit/runtime/register_string_ops.cpp diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index 6f2b988619c7..c1e7c9ac0c64 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -27,38 +27,37 @@ TORCH_LIBRARY(aten, m) { ${function_registrations}; // String Ops - // Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp - m.def("splitlines(str self, bool keepends=False) -> str[]"); - m.def( - "slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str"); - m.def("isupper(str self) -> bool"); - m.def("islower(str self) -> bool"); - m.def("capitalize(str self) -> str"); - m.def("title(str self) -> str"); - m.def("center(str self, int width, str fillchar=' ') -> str"); - m.def("count(str self, str substr, int start=0, int end=-1) -> int"); - m.def("endswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("startswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("expandtabs(str self, int tabsize=8) -> str"); - m.def("find(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rfind(str self, str substr, int start=0, int end=-1) -> int"); - m.def("index.str(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rindex(str self, str substr, int start=0, int end=-1) -> int"); - m.def("isidentifier(str self) -> bool"); - m.def("istitle(str self) -> bool"); - m.def("isprintable(str self) -> bool"); - m.def("ljust(str self, int width, str fillchar=' ') -> str"); - m.def("rjust(str self, int width, str fillchar=' ') -> str"); - m.def("zfill(str self, int width) -> str"); - m.def("lstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("rstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("strip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("replace(str self, str old, str new, int max=-1) -> str"); - m.def("partition(str self, str separator) -> (str, str, str)"); - m.def("rpartition(str self, str separator) -> (str, str, str)"); - m.def("split.str(str self, str? 
separator=None, int max=-1) -> str[]"); - m.def("rsplit(str self, str separator=' ', int max=-1) -> str[]"); - m.def("join(str self, str[] values) -> str"); + // Implementations located in torch/csrc/jit/runtime/register_prim_ops.cpp + m.def(TORCH_SELECTIVE_SCHEMA("aten::splitlines(str self, bool keepends=False) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isupper(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::islower(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::capitalize(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::title(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::center(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::count(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::endswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::startswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::expandtabs(str self, int tabsize=8) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::find(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rfind(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::index.str(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rindex(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isidentifier(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::istitle(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isprintable(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::ljust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rjust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::zfill(str self, int width) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::lstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::strip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::replace(str self, str old, str new, int max=-1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::partition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rpartition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::split.str(str self, str? 
separator=None, int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rsplit(str self, str separator=' ', int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::join(str self, str[] values) -> str")); // Integer Ops // Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cp diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3f5126358804..b1a2967f5dea 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -302,7 +302,6 @@ jit_sources_full = [ "torch/csrc/jit/runtime/register_prim_ops.cpp", "torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp", "torch/csrc/jit/runtime/register_special_ops.cpp", - "torch/csrc/jit/runtime/register_string_ops.cpp", "torch/csrc/jit/passes/remove_inplace_ops.cpp", "torch/csrc/jit/passes/utils/check_alias_annotation.cpp", ] diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index ed3e2aceb19a..98f328a43240 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -75,59 +76,6 @@ c10::List splitNoneSeparator(const std::string& string) { return splits; } -TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("slice.str", TORCH_FN(stringSlice)); - m.impl("strip", [](std::string string, const std::string& chars) { - auto rindex = string.find_last_not_of(chars); - if (rindex != std::string::npos) { - string = string.substr(0, rindex + 1); - } else { - string = ""; - } - auto lindex = string.find_first_not_of(chars); - if (lindex != std::string::npos) { - string = string.substr(lindex, string.size()); - } else { - string = ""; - } - return string; - }); - m.impl( - "split.str", - [](const std::string& string, - c10::optional separator, - int64_t max) { - if (!separator.has_value()) { - // if separator is not specified, - // a different splitting algorithm is applied as Python - return splitNoneSeparator(string); - ; - } - if (separator.value().empty()) { - throw std::runtime_error("ValueError: empty separator"); - } - - std::string::size_type prev_pos = 0; - std::string::size_type pos = 0; - c10::List splits; - auto count = 0; - - while ((pos = string.find(separator.value(), pos)) != - std::string::npos) { - count++; - if (max >= 0 && count > max) { - break; - } else { - splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); - } - pos += separator.value().size(); - prev_pos = pos; - } - splits.emplace_back(string.substr(prev_pos, string.size() - prev_pos)); - return splits; - }); -} - RegisterOperators reg( {OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::str(t elem) -> str"), @@ -1238,6 +1186,573 @@ RegisterOperators reg_dict_ops({ CREATE_DICT_OPS("Tensor"), }); +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +// Convert an python index (which may be negative) into an index usable for a +// C++ container +int64_t normalizeIndex(int64_t idx, int64_t list_size) { + if (idx < 0) { + // Handle negative indexing + idx = list_size + idx; + } + return idx; +} + +int64_t stringFindImpl( + std::string string, + std::string substr, + int64_t start, + int64_t end, + bool reverse = false) { + int64_t size = string.size(); + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + if (end > start) { + string = string.substr(start, end - start); + } else { + string = ""; + } + + int64_t result = 
-1; + if (string.size() >= substr.size()) { + auto pos = string.find(substr, 0); + if (reverse) { + auto rpos = pos; + do { + pos = rpos; + rpos = string.find(substr, pos + 1); + } while (rpos != std::string::npos); + } + if (pos != std::string::npos) { + result = pos + start; + } + } + return result; +} + +// String Ops +// Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp +TORCH_LIBRARY_IMPL(aten, CatchAll, m) { + m.impl(TORCH_SELECTIVE_NAME("aten::slice.str"), TORCH_FN(stringSlice)); + m.impl( + TORCH_SELECTIVE_NAME("aten::strip"), + [](std::string string, const std::string& chars) { + auto rindex = string.find_last_not_of(chars); + if (rindex != std::string::npos) { + string = string.substr(0, rindex + 1); + } else { + string = ""; + } + auto lindex = string.find_first_not_of(chars); + if (lindex != std::string::npos) { + string = string.substr(lindex, string.size()); + } else { + string = ""; + } + return string; + }); + m.impl( + TORCH_SELECTIVE_NAME("aten::split.str"), + [](const std::string& string, + c10::optional separator, + int64_t max) { + if (!separator.has_value()) { + // if separator is not specified, + // a different splitting algorithm is applied as Python + return splitNoneSeparator(string); + ; + } + if (separator.value().empty()) { + throw std::runtime_error("ValueError: empty separator"); + } + + std::string::size_type prev_pos = 0; + std::string::size_type pos = 0; + c10::List splits; + auto count = 0; + + while ((pos = string.find(separator.value(), pos)) != + std::string::npos) { + count++; + if (max >= 0 && count > max) { + break; + } else { + splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); + } + pos += separator.value().size(); + prev_pos = pos; + } + splits.emplace_back(string.substr(prev_pos, string.size() - prev_pos)); + return splits; + }); + m.impl( + TORCH_SELECTIVE_NAME("aten::splitlines"), + [](std::string string, bool keepends) { + std::string delimiters = + "\n\r\r\n\v\x0b\f\x0c\x1c\x1d\x1e\x85\u2028\u2029"; + c10::List splits; + + std::string::size_type prev_pos = 0; + std::string::size_type pos = 0; + while ((pos = string.find_first_of(delimiters, pos)) != + std::string::npos) { + splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); + if (keepends) { + splits.emplace_back(string.substr(pos, 1)); + } + pos++; + prev_pos = pos; + } + if (prev_pos != string.size()) { + splits.emplace_back( + string.substr(prev_pos, string.size() - prev_pos)); + } + + return splits; + }); + + // upper and lower require there to be at least one alpha character, + // and ignore all other characters + m.impl(TORCH_SELECTIVE_NAME("aten::isupper"), [](std::string string) { + bool found_alpha = false; + bool is_upper = true; + for (size_t i = 0; i < string.size() && is_upper; ++i) { + char c = string[i]; + found_alpha |= static_cast(::isalpha(c)); + is_upper &= (!::isalpha(c) || ::isupper(c)); + } + return found_alpha && is_upper; + }); + m.impl(TORCH_SELECTIVE_NAME("aten::islower"), [](std::string string) { + bool found_alpha = false; + bool is_lower = true; + for (size_t i = 0; i < string.size() && is_lower; ++i) { + char c = string[i]; + found_alpha |= static_cast(::isalpha(c)); + is_lower &= (!::isalpha(c) || ::islower(c)); + } + return found_alpha && is_lower; + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::capitalize"), [](std::string string) { + std::stringstream ss; + auto first_char = true; + for (char c : string) { + if (first_char) { + ss << static_cast(::toupper(c)); + first_char = false; + } else { + ss << 
static_cast(::tolower(c)); + } + } + return ss.str(); + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::title"), [](std::string string) { + std::stringstream ss; + bool prev_is_nonalpha = true; + for (char c : string) { + if (prev_is_nonalpha) { + ss << static_cast(::toupper(c)); + } else { + ss << static_cast(::tolower(c)); + } + if (::isalpha(c)) { + prev_is_nonalpha = false; + } else { + prev_is_nonalpha = true; + } + } + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::center"), + [](std::string string, int64_t width, std::string fillchar) { + if (fillchar.size() != 1) { + // TODO: this should be a TypeError + throw std::runtime_error( + "TypeError: The fill character must be exactly one character long"); + } + if (string.size() > static_cast(width)) { + return string; + } + std::stringstream ss; + std::string::size_type full_padding = width - string.size(); + std::string::size_type l_pad = full_padding / 2; + std::string::size_type r_pad = (full_padding + 1) / 2; + if (width % 2) { + auto tmp = r_pad; + r_pad = l_pad; + l_pad = tmp; + } + for (std::string::size_type i = 0; i < l_pad; ++i) { + ss << fillchar; + } + ss << string; + for (std::string::size_type i = 0; i < r_pad; ++i) { + ss << fillchar; + } + return ss.str(); + }); + + // Adapted from + // https://stackoverflow.com/questions/22489073/counting-the-number-of-occurrences-of-a-string-within-a-string + m.impl( + TORCH_SELECTIVE_NAME("aten::count"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + int64_t size = string.size(); + if (start > size) { + return int64_t(0); + } + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + + int64_t occurrences = 0; + std::string::size_type pos = start; + while ((pos = string.find(substr, pos)) != std::string::npos) { + if (pos < static_cast(end)) { + ++occurrences; + } else { + break; + } + pos += substr.length(); + } + return occurrences; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::endswith"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + int64_t size = string.size(); + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + + string = string.substr(start, end - start); + + auto result = false; + if (string.length() >= substr.length()) { + result = !string.compare( + string.length() - substr.length(), substr.length(), substr); + } + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::startswith"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + int64_t size = string.size(); + if (start < 0) { + start = std::max(int64_t(0), int64_t(size + start)); + } + if (end < 0) { + end = std::max(int64_t(0), int64_t(size + end + 1)); + } + + string = string.substr(start, end - start); + + auto result = false; + if (string.length() >= substr.length()) { + result = !string.compare(0, substr.length(), substr); + } + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::expandtabs"), + [](std::string string, int64_t tabsize) { + std::stringstream ss; + size_t index = 0; + for (const auto& c : string) { + if (c != '\t') { + ss << c; + index++; + } else { + if (tabsize <= 0) { + continue; + } + do { + ss << ' '; + index++; + } while (index % tabsize); + } + } + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::find"), + [](std::string string, std::string substr, int64_t start, 
int64_t end) { + return stringFindImpl(string, substr, start, end); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rfind"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + return stringFindImpl(string, substr, start, end, true); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::index.str"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + auto result = stringFindImpl(string, substr, start, end); + if (result < 0) { + throw std::runtime_error("ValueError: substring not found"); + } + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rindex"), + [](std::string string, std::string substr, int64_t start, int64_t end) { + auto result = stringFindImpl(string, substr, start, end, true); + if (result < 0) { + throw std::runtime_error("ValueError: substring not found"); + } + return result; + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::isidentifier"), [](std::string string) { + LOG(WARNING) + << "The isidentifier() implementation being used is from Python 2\n"; + if (string.size() < 1) { + return false; + } + if (::isdigit(string[0])) { + return false; + } + auto result = std::all_of( + string.begin(), string.end(), [](char c) { return ::isalnum(c); }); + return result; + }); + + m.impl(TORCH_SELECTIVE_NAME("aten::istitle"), [](std::string string) { + auto result = false; + + bool prev_is_alpha = false; + for (char c : string) { + if (prev_is_alpha) { + if (c != static_cast(::tolower(c))) { + result = false; + break; + } + } else { + if (c != static_cast(::toupper(c))) { + result = false; + break; + } + // Only true if there exists at least one alpha + if (::isalpha(c)) { + result = true; + } + } + if (::isalpha(c)) { + prev_is_alpha = true; + } else { + prev_is_alpha = false; + } + } + return result; + }); + + // Can't reuse DEFINE_STRING_IS_OP because "" is printable + m.impl(TORCH_SELECTIVE_NAME("aten::isprintable"), [](std::string string) { + auto result = std::all_of(string.begin(), string.end(), [](char c) { + return ::isalnum(c) || ::ispunct(c) || c == ' '; + }); + return result; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::ljust"), + [](std::string string, int64_t width, std::string fillchar) { + if (fillchar.size() != 1) { + // TODO: this should be a TypeError + throw std::runtime_error( + "TypeError: The fill character must be exactly one character long"); + } + auto to_append = + std::max(int64_t(0), width - static_cast(string.size())); + + std::stringstream ss; + ss << string; + for (auto i = 0; i < to_append; ++i) { + ss << fillchar; + } + + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rjust"), + [](std::string string, int64_t width, std::string fillchar) { + if (fillchar.size() != 1) { + // TODO: this should be a TypeError + throw std::runtime_error( + "TypeError: The fill character must be exactly one character long"); + } + auto to_append = + std::max(int64_t(0), width - static_cast(string.size())); + + std::stringstream ss; + for (auto i = 0; i < to_append; ++i) { + ss << fillchar; + } + ss << string; + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::zfill"), + [](std::string string, int64_t width) { + auto to_append = + std::max(int64_t(0), width - static_cast(string.size())); + + std::stringstream ss; + for (auto i = 0; i < to_append; ++i) { + ss << '0'; + } + ss << string; + + return ss.str(); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::lstrip"), + [](std::string string, std::string chars) { + auto index = string.find_first_not_of(chars); + if (index != 
std::string::npos) { + string = string.substr(index, string.size()); + } else { + string = ""; + } + return string; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rstrip"), + [](std::string string, std::string chars) { + auto index = string.find_last_not_of(chars); + if (index != std::string::npos) { + string = string.substr(0, index + 1); + } else { + string = ""; + } + return string; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::replace"), + [](std::string string, + std::string old_str, + std::string new_str, + int64_t max) { + int64_t occurrences = 0; + std::string::size_type pos = 0; + while ((pos = string.find(old_str, pos)) != std::string::npos) { + if (max >= 0 && ++occurrences > max) { + break; + } + string = string.replace(pos, old_str.length(), new_str); + pos += new_str.length(); + } + + return string; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::partition"), + [](std::string string, std::string separator) { + auto pos = string.find(separator, 0); + if (pos == std::string::npos) { + pos = string.size(); + separator = ""; + } + auto pre_partition = string.substr(0, pos); + auto post_partition = + string.substr(pos + separator.size(), string.size()); + + return std::make_tuple(pre_partition, separator, post_partition); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rpartition"), + [](std::string string, std::string separator) { + auto pos = string.find(separator, 0); + auto rpos = pos; + do { + pos = rpos; + rpos = string.find(separator, pos + 1); + } while (rpos != std::string::npos); + + if (pos == std::string::npos) { + pos = 0; + separator = ""; + } + + auto pre_partition = string.substr(0, pos); + auto post_partition = + string.substr(pos + separator.size(), string.size()); + + return std::make_tuple(pre_partition, separator, post_partition); + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::rsplit"), + [](std::string string, std::string separator, int64_t max) { + std::reverse(separator.begin(), separator.end()); + std::reverse(string.begin(), string.end()); + + std::string::size_type prev_pos = 0; + std::string::size_type pos = 0; + c10::List splits; + auto count = 0; + while ((pos = string.find(separator, pos)) != std::string::npos) { + count++; + if (max >= 0 && count > max) { + break; + } else { + auto substr = string.substr(prev_pos, pos - prev_pos); + std::reverse(substr.begin(), substr.end()); + splits.emplace(splits.begin(), substr); + } + pos += separator.size(); + prev_pos = pos; + } + auto substr = string.substr(prev_pos, string.size() - prev_pos); + std::reverse(substr.begin(), substr.end()); + splits.emplace(splits.begin(), substr); + return splits; + }); + + m.impl( + TORCH_SELECTIVE_NAME("aten::join"), + [](const std::string& string, const c10::List& values) { + std::stringstream ss; + for (auto it = values.begin(); it != values.end(); ++it) { + ss << static_cast(*it); + if (it != values.end() - 1) { + ss << string; + } + } + return ss.str(); + }); +} + } // namespace } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/register_string_ops.cpp b/torch/csrc/jit/runtime/register_string_ops.cpp deleted file mode 100644 index 244893b10393..000000000000 --- a/torch/csrc/jit/runtime/register_string_ops.cpp +++ /dev/null @@ -1,499 +0,0 @@ -#include -#include -#include - -namespace torch { -namespace jit { -namespace { - -c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; -} - -// Convert an python index (which may be negative) into an index usable for a -// C++ container -int64_t 
normalizeIndex(int64_t idx, int64_t list_size) { - if (idx < 0) { - // Handle negative indexing - idx = list_size + idx; - } - return idx; -} - -int64_t stringFindImpl( - std::string string, - std::string substr, - int64_t start, - int64_t end, - bool reverse = false) { - int64_t size = string.size(); - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - if (end > start) { - string = string.substr(start, end - start); - } else { - string = ""; - } - - int64_t result = -1; - if (string.size() >= substr.size()) { - auto pos = string.find(substr, 0); - if (reverse) { - auto rpos = pos; - do { - pos = rpos; - rpos = string.find(substr, pos + 1); - } while (rpos != std::string::npos); - } - if (pos != std::string::npos) { - result = pos + start; - } - } - return result; -} - -// String Ops -// Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp -TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("splitlines", [](std::string string, bool keepends) { - std::string delimiters = "\n\r\r\n\v\x0b\f\x0c\x1c\x1d\x1e\x85\u2028\u2029"; - c10::List splits; - - std::string::size_type prev_pos = 0; - std::string::size_type pos = 0; - while ((pos = string.find_first_of(delimiters, pos)) != std::string::npos) { - splits.emplace_back(string.substr(prev_pos, pos - prev_pos)); - if (keepends) { - splits.emplace_back(string.substr(pos, 1)); - } - pos++; - prev_pos = pos; - } - if (prev_pos != string.size()) { - splits.emplace_back(string.substr(prev_pos, string.size() - prev_pos)); - } - - return splits; - }); - - // upper and lower require there to be at least one alpha character, - // and ignore all other characters - m.impl("isupper", [](std::string string) { - bool found_alpha = false; - bool is_upper = true; - for (size_t i = 0; i < string.size() && is_upper; ++i) { - char c = string[i]; - found_alpha |= static_cast(::isalpha(c)); - is_upper &= (!::isalpha(c) || ::isupper(c)); - } - return found_alpha && is_upper; - }); - m.impl("islower", [](std::string string) { - bool found_alpha = false; - bool is_lower = true; - for (size_t i = 0; i < string.size() && is_lower; ++i) { - char c = string[i]; - found_alpha |= static_cast(::isalpha(c)); - is_lower &= (!::isalpha(c) || ::islower(c)); - } - return found_alpha && is_lower; - }); - - m.impl("capitalize", [](std::string string) { - std::stringstream ss; - auto first_char = true; - for (char c : string) { - if (first_char) { - ss << static_cast(::toupper(c)); - first_char = false; - } else { - ss << static_cast(::tolower(c)); - } - } - return ss.str(); - }); - - m.impl("title", [](std::string string) { - std::stringstream ss; - bool prev_is_nonalpha = true; - for (char c : string) { - if (prev_is_nonalpha) { - ss << static_cast(::toupper(c)); - } else { - ss << static_cast(::tolower(c)); - } - if (::isalpha(c)) { - prev_is_nonalpha = false; - } else { - prev_is_nonalpha = true; - } - } - return ss.str(); - }); - - m.impl("center", [](std::string string, int64_t width, std::string fillchar) { - if (fillchar.size() != 1) { - // TODO: this should be a TypeError - throw std::runtime_error( - "TypeError: The fill character must be exactly one character long"); - } - if (string.size() > static_cast(width)) { - return string; - } - std::stringstream ss; - std::string::size_type full_padding = width - string.size(); - std::string::size_type l_pad = full_padding / 2; - std::string::size_type r_pad = (full_padding + 1) / 2; - if (width % 2) { - auto tmp 
= r_pad; - r_pad = l_pad; - l_pad = tmp; - } - for (std::string::size_type i = 0; i < l_pad; ++i) { - ss << fillchar; - } - ss << string; - for (std::string::size_type i = 0; i < r_pad; ++i) { - ss << fillchar; - } - return ss.str(); - }); - - // Adapted from - // https://stackoverflow.com/questions/22489073/counting-the-number-of-occurrences-of-a-string-within-a-string - m.impl( - "count", - [](std::string string, std::string substr, int64_t start, int64_t end) { - int64_t size = string.size(); - if (start > size) { - return int64_t(0); - } - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - - int64_t occurrences = 0; - std::string::size_type pos = start; - while ((pos = string.find(substr, pos)) != std::string::npos) { - if (pos < static_cast(end)) { - ++occurrences; - } else { - break; - } - pos += substr.length(); - } - return occurrences; - }); - - m.impl( - "endswith", - [](std::string string, std::string substr, int64_t start, int64_t end) { - int64_t size = string.size(); - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - - string = string.substr(start, end - start); - - auto result = false; - if (string.length() >= substr.length()) { - result = !string.compare( - string.length() - substr.length(), substr.length(), substr); - } - return result; - }); - - m.impl( - "startswith", - [](std::string string, std::string substr, int64_t start, int64_t end) { - int64_t size = string.size(); - if (start < 0) { - start = std::max(int64_t(0), int64_t(size + start)); - } - if (end < 0) { - end = std::max(int64_t(0), int64_t(size + end + 1)); - } - - string = string.substr(start, end - start); - - auto result = false; - if (string.length() >= substr.length()) { - result = !string.compare(0, substr.length(), substr); - } - return result; - }); - - m.impl("expandtabs", [](std::string string, int64_t tabsize) { - std::stringstream ss; - size_t index = 0; - for (const auto& c : string) { - if (c != '\t') { - ss << c; - index++; - } else { - if (tabsize <= 0) { - continue; - } - do { - ss << ' '; - index++; - } while (index % tabsize); - } - } - return ss.str(); - }); - - m.impl( - "find", - [](std::string string, std::string substr, int64_t start, int64_t end) { - return stringFindImpl(string, substr, start, end); - }); - - m.impl( - "rfind", - [](std::string string, std::string substr, int64_t start, int64_t end) { - return stringFindImpl(string, substr, start, end, true); - }); - - m.impl( - "index.str", - [](std::string string, std::string substr, int64_t start, int64_t end) { - auto result = stringFindImpl(string, substr, start, end); - if (result < 0) { - throw std::runtime_error("ValueError: substring not found"); - } - return result; - }); - - m.impl( - "rindex", - [](std::string string, std::string substr, int64_t start, int64_t end) { - auto result = stringFindImpl(string, substr, start, end, true); - if (result < 0) { - throw std::runtime_error("ValueError: substring not found"); - } - return result; - }); - - m.impl("isidentifier", [](std::string string) { - LOG(WARNING) - << "The isidentifier() implementation being used is from Python 2\n"; - if (string.size() < 1) { - return false; - } - if (::isdigit(string[0])) { - return false; - } - auto result = std::all_of( - string.begin(), string.end(), [](char c) { return ::isalnum(c); }); - return result; - }); - - m.impl("istitle", 
[](std::string string) { - auto result = false; - - bool prev_is_alpha = false; - for (char c : string) { - if (prev_is_alpha) { - if (c != static_cast(::tolower(c))) { - result = false; - break; - } - } else { - if (c != static_cast(::toupper(c))) { - result = false; - break; - } - // Only true if there exists at least one alpha - if (::isalpha(c)) { - result = true; - } - } - if (::isalpha(c)) { - prev_is_alpha = true; - } else { - prev_is_alpha = false; - } - } - return result; - }); - - // Can't reuse DEFINE_STRING_IS_OP because "" is printable - m.impl("isprintable", [](std::string string) { - auto result = std::all_of(string.begin(), string.end(), [](char c) { - return ::isalnum(c) || ::ispunct(c) || c == ' '; - }); - return result; - }); - - m.impl("ljust", [](std::string string, int64_t width, std::string fillchar) { - if (fillchar.size() != 1) { - // TODO: this should be a TypeError - throw std::runtime_error( - "TypeError: The fill character must be exactly one character long"); - } - auto to_append = - std::max(int64_t(0), width - static_cast(string.size())); - - std::stringstream ss; - ss << string; - for (auto i = 0; i < to_append; ++i) { - ss << fillchar; - } - - return ss.str(); - }); - - m.impl("rjust", [](std::string string, int64_t width, std::string fillchar) { - if (fillchar.size() != 1) { - // TODO: this should be a TypeError - throw std::runtime_error( - "TypeError: The fill character must be exactly one character long"); - } - auto to_append = - std::max(int64_t(0), width - static_cast(string.size())); - - std::stringstream ss; - for (auto i = 0; i < to_append; ++i) { - ss << fillchar; - } - ss << string; - return ss.str(); - }); - - m.impl("zfill", [](std::string string, int64_t width) { - auto to_append = - std::max(int64_t(0), width - static_cast(string.size())); - - std::stringstream ss; - for (auto i = 0; i < to_append; ++i) { - ss << '0'; - } - ss << string; - - return ss.str(); - }); - - m.impl("lstrip", [](std::string string, std::string chars) { - auto index = string.find_first_not_of(chars); - if (index != std::string::npos) { - string = string.substr(index, string.size()); - } else { - string = ""; - } - return string; - }); - - m.impl("rstrip", [](std::string string, std::string chars) { - auto index = string.find_last_not_of(chars); - if (index != std::string::npos) { - string = string.substr(0, index + 1); - } else { - string = ""; - } - return string; - }); - - m.impl( - "replace", - [](std::string string, - std::string old_str, - std::string new_str, - int64_t max) { - int64_t occurrences = 0; - std::string::size_type pos = 0; - while ((pos = string.find(old_str, pos)) != std::string::npos) { - if (max >= 0 && ++occurrences > max) { - break; - } - string = string.replace(pos, old_str.length(), new_str); - pos += new_str.length(); - } - - return string; - }); - - m.impl("partition", [](std::string string, std::string separator) { - auto pos = string.find(separator, 0); - if (pos == std::string::npos) { - pos = string.size(); - separator = ""; - } - auto pre_partition = string.substr(0, pos); - auto post_partition = string.substr(pos + separator.size(), string.size()); - - return std::make_tuple(pre_partition, separator, post_partition); - }); - - m.impl("rpartition", [](std::string string, std::string separator) { - auto pos = string.find(separator, 0); - auto rpos = pos; - do { - pos = rpos; - rpos = string.find(separator, pos + 1); - } while (rpos != std::string::npos); - - if (pos == std::string::npos) { - pos = 0; - separator = ""; - } - - auto 
pre_partition = string.substr(0, pos); - auto post_partition = string.substr(pos + separator.size(), string.size()); - - return std::make_tuple(pre_partition, separator, post_partition); - }); - - m.impl("rsplit", [](std::string string, std::string separator, int64_t max) { - std::reverse(separator.begin(), separator.end()); - std::reverse(string.begin(), string.end()); - - std::string::size_type prev_pos = 0; - std::string::size_type pos = 0; - c10::List splits; - auto count = 0; - while ((pos = string.find(separator, pos)) != std::string::npos) { - count++; - if (max >= 0 && count > max) { - break; - } else { - auto substr = string.substr(prev_pos, pos - prev_pos); - std::reverse(substr.begin(), substr.end()); - splits.emplace(splits.begin(), substr); - } - pos += separator.size(); - prev_pos = pos; - } - auto substr = string.substr(prev_pos, string.size() - prev_pos); - std::reverse(substr.begin(), substr.end()); - splits.emplace(splits.begin(), substr); - return splits; - }); - - m.impl( - "join", - [](const std::string& string, const c10::List& values) { - std::stringstream ss; - for (auto it = values.begin(); it != values.end(); ++it) { - ss << static_cast(*it); - if (it != values.end() - 1) { - ss << string; - } - } - return ss.str(); - }); -} - -} // namespace -} // namespace jit -} // namespace torch From 94c3cdd994dfe276b89ffb412cfe8a80b2f37bc7 Mon Sep 17 00:00:00 2001 From: Shen Li Date: Wed, 23 Sep 2020 08:04:27 -0700 Subject: [PATCH 050/449] Let rpc._all_gather use default RPC timeout (#44983) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44983 `_all_gather` was converted from `_wait_all_workers` and inherited its 5 seconds fixed timeout. As `_all_gather` meant to support a broader set of use cases, the timeout configuration should be more flexible. This PR makes `rpc._all_gather` use the global default RPC timeout. 
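For illustration, a minimal sketch of how the more flexible timeout plays out in practice. `_all_gather` is an internal helper and the gathered object below is a placeholder, so treat this as an assumption about usage rather than a public API:

```python
import torch.distributed.rpc as rpc
from torch.distributed.rpc.constants import DEFAULT_SHUTDOWN_TIMEOUT

# Inside an already-initialized RPC worker group:

# With no timeout argument, _all_gather now falls back to the globally
# configured RPC timeout instead of a hard-coded 5 seconds.
results = rpc.api._all_gather(some_picklable_obj)  # some_picklable_obj is hypothetical

# Callers that need a specific bound pass one explicitly, as
# _wait_all_workers now does during shutdown.
results = rpc.api._all_gather(None, timeout=DEFAULT_SHUTDOWN_TIMEOUT)
```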
Test Plan: Imported from OSS Reviewed By: pritamdamania87 Differential Revision: D23794383 Pulled By: mrshenli fbshipit-source-id: 382f52c375f0f25c032c5abfc910f72baf4c5ad9 --- torch/csrc/distributed/rpc/init.cpp | 1 - .../distributed/rpc/process_group_agent.cpp | 8 +----- torch/csrc/distributed/rpc/rpc_agent.h | 3 +++ .../csrc/distributed/rpc/tensorpipe_agent.cpp | 8 ++---- .../testing/faulty_process_group_agent.cpp | 4 --- torch/distributed/rpc/api.py | 17 ++++++------ torch/distributed/rpc/constants.py | 1 + .../_internal/distributed/rpc/rpc_test.py | 27 +++++++++++++++++++ 8 files changed, 42 insertions(+), 27 deletions(-) diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index ea1db04225c7..34023afdce91 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -27,7 +27,6 @@ namespace rpc { namespace { constexpr std::chrono::milliseconds kDeleteAllUsersTimeout(100000); -constexpr float kSecToMsConversion = 1000; template using shared_ptr_class_ = py::class_>; diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index fe93e43d01f3..d97577724a55 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -8,12 +8,6 @@ namespace torch { namespace distributed { namespace rpc { -const std::string kRPCTimeoutErrorStr = - "RPC ran for more than {} milliseconds and timed out."; - -namespace { -constexpr auto kSecToMsConversion = 1000; -} ////////////////////////// MessageCounter ///////////////////////////////// @@ -802,7 +796,7 @@ void ProcessGroupAgent::pollTimedOutRPCs() { for (const auto& timedOutFuture : timedOutFutures) { auto errStr = - fmt::format(kRPCTimeoutErrorStr, timedOutFuture.timeout_.count()); + fmt::format(kRpcTimeoutErrorStr, timedOutFuture.timeout_.count()); auto err = makeRPCError(errStr, RPCErrorType::TIMEOUT); if (!timedOutFuture.future_->hasError()) { diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index 605744a1f227..34b77a085510 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -17,6 +17,9 @@ constexpr float kDefaultRpcTimeoutSeconds = 60; // timeout for RPCs. 
constexpr float kUnsetRpcTimeout = -1; constexpr auto kDefaultInitMethod = "env://"; +constexpr float kSecToMsConversion = 1000; +constexpr auto kRpcTimeoutErrorStr = + "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; using steady_clock_time_point = std::chrono::time_point; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index d9ce2c3b27eb..11c5408c2c35 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -22,16 +22,12 @@ namespace { const std::string kSocketIfnameEnvVar = "TP_SOCKET_IFNAME"; const std::string kDefaultUvAddress = "127.0.0.1"; -constexpr long kToMilliseconds = 1000; - const std::string kGilAverageWaitTime = "agent.gil_average_wait_time_us"; const std::string kThreadPoolSize = "agent.thread_pool_size"; const std::string kNumIdleThreads = "agent.num_idle_threads"; const std::string kClientActiveCalls = "agent.client_active_calls"; const std::string kServerActiveCalls = "agent.server_active_calls"; const std::string kServerActiveAsyncCalls = "agent.server_active_async_calls"; -const std::string kRpcTimeoutErrorStr = - "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; inline void checkCPUTensor(const torch::Tensor& tensor) { TORCH_CHECK( @@ -273,7 +269,7 @@ TensorPipeAgent::TensorPipeAgent( WorkerInfo(std::move(selfName), selfId), std::move(cb), std::chrono::milliseconds( - (long)(opts.rpcTimeoutSeconds * kToMilliseconds))), + (long)(opts.rpcTimeoutSeconds * kSecToMsConversion))), opts_(std::move(opts)), threadPool_(opts_.numWorkerThreads), context_(std::make_shared( @@ -685,7 +681,7 @@ std::shared_ptr TensorPipeAgent::send( auto timeout = rpcTimeoutSeconds == kUnsetRpcTimeout ? getRpcTimeout() : std::chrono::milliseconds( - static_cast(rpcTimeoutSeconds * kToMilliseconds)); + static_cast(rpcTimeoutSeconds * kSecToMsConversion)); // We only add to the timeoutMap_ if the timeout is not 0. Per our // documentation, a user-provided timeout of 0 indicates the RPC should never diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp index a03ff5cafecd..a1be688a285e 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp @@ -6,10 +6,6 @@ namespace torch { namespace distributed { namespace rpc { -namespace { -constexpr auto kSecToMsConversion = 1000; -} - std::string fromVec(const std::vector& vec) { return std::string(vec.begin(), vec.end()); } diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py index af28e6023c60..d1b62a5b0ab4 100644 --- a/torch/distributed/rpc/api.py +++ b/torch/distributed/rpc/api.py @@ -12,6 +12,7 @@ PyRRef, RemoteProfilerManager, WorkerInfo, + get_rpc_timeout, _cleanup_python_rpc_handler, _delete_all_user_and_unforked_owner_rrefs, _destroy_rref_context, @@ -34,7 +35,7 @@ _build_rpc_profiling_key, ) -from .constants import UNSET_RPC_TIMEOUT +from .constants import DEFAULT_SHUTDOWN_TIMEOUT, UNSET_RPC_TIMEOUT logger = logging.getLogger(__name__) @@ -142,7 +143,7 @@ def _broadcast_to_followers(sequence_id, objects_map): @_require_initialized -def _all_gather(obj): +def _all_gather(obj, timeout=UNSET_RPC_TIMEOUT): r""" This is similar to torch.distributed.all_gather(), but is using RPC. It picks the worker with the smallest name (alphabetic order) as the leader. 
@@ -163,8 +164,8 @@ def _all_gather(obj): _all_gather_sequence_id += 1 is_leader = leader_name == self_name - # Set a long enough timeout for all shutdown messages to be processed. - timeout = 5 # second + if timeout == UNSET_RPC_TIMEOUT: + timeout = get_rpc_timeout() # Phase 1: Followers send it's object to the leader if is_leader: @@ -178,9 +179,7 @@ def _all_gather(obj): ) with _all_gather_dict_lock: - states = _all_gather_sequence_id_to_states[ - sequence_id - ] + states = _all_gather_sequence_id_to_states[sequence_id] states.proceed_signal.wait() # Phase 2: Leader broadcast gathered results to all followers @@ -207,7 +206,7 @@ def _all_gather(obj): if errors: raise RuntimeError( f"Followers {[e[0] for e in errors]} timed out in _all_gather " - f"after {timeout} seconds. The first exception is {errors[0][1]}" + f"after {timeout:.2f} seconds. The first exception is {errors[0][1]}" ) return states.gathered_objects @@ -223,7 +222,7 @@ def _wait_all_workers(): framework will work after this method returns. """ try: - _all_gather(None) + _all_gather(None, timeout=DEFAULT_SHUTDOWN_TIMEOUT) except RuntimeError as ex: logger.error( f"Failed to respond to 'Shutdown Proceed' in time, got error {ex}" diff --git a/torch/distributed/rpc/constants.py b/torch/distributed/rpc/constants.py index ecd9552ce40b..c2dd804e4c81 100644 --- a/torch/distributed/rpc/constants.py +++ b/torch/distributed/rpc/constants.py @@ -12,6 +12,7 @@ # For any RpcAgent. DEFAULT_RPC_TIMEOUT_SEC = _DEFAULT_RPC_TIMEOUT_SEC DEFAULT_INIT_METHOD = _DEFAULT_INIT_METHOD +DEFAULT_SHUTDOWN_TIMEOUT = 5.0 # For ProcessGroupAgent. DEFAULT_NUM_SEND_RECV_THREADS = _DEFAULT_NUM_SEND_RECV_THREADS diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index e343ffc1939b..797e5a010b86 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -131,6 +131,19 @@ def set(self, val): self.t = val +class SlowPickleClass: + def __init__(self, t): + self.t = t + + def __getstate__(self): + time.sleep(self.t) + return (self.t, ) + + def __setstate__(self, obj): + self.t = obj[0] + time.sleep(self.t) + + class MyClass: def __init__(self, a): self.a = a @@ -931,6 +944,20 @@ def test_all_gather(self): self.assertEqual(expected, results) + @dist_init + def test_all_gather_timeout(self): + rpc._set_rpc_timeout(0.1) + + if self.rank == 0: + with self.assertRaisesRegex( + RuntimeError, + "timed out in _all_gather after 0\\.10 seconds" + ): + rpc.api._all_gather(SlowPickleClass(0.5)) + else: + with self.assertRaisesRegex(RuntimeError, "timeout.*100 ms"): + rpc.api._all_gather(SlowPickleClass(0.5)) + @dist_init def test_graceful_shutdown_with_uneven_workload(self): """Test graceful termination.""" From 5b20bf4fd97c9acd899ea780cda8738aa775cdec Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Wed, 23 Sep 2020 08:24:08 -0700 Subject: [PATCH 051/449] Added support for complex input for Cholesky decomposition (#44895) Summary: Cholesky decomposition now works for complex inputs. Fixes https://github.com/pytorch/pytorch/issues/44637. 
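As a quick illustration of the newly supported path, here is a hedged sketch mirroring the test changes below (CPU needs LAPACK and CUDA needs MAGMA for this to run):

```python
import torch

# Build a Hermitian positive-definite matrix A = B @ B^H from a random complex B
# (constructed from real parts, mirroring the workaround used in the tests below).
B = torch.randn(4, 4, dtype=torch.float64) + 1j * torch.randn(4, 4, dtype=torch.float64)
A = B @ B.t().conj()

L = torch.cholesky(A)                # lower-triangular factor (default)
U = torch.cholesky(A, upper=True)    # upper-triangular factor

# The factors reconstruct A; note the conjugate transpose, not a plain transpose.
assert (L @ L.t().conj() - A).abs().max() < 1e-10
assert (U.t().conj() @ U - A).abs().max() < 1e-10
```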
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44895 Reviewed By: ailzhang Differential Revision: D23841583 Pulled By: anjali411 fbshipit-source-id: 3b1f34a7af17827884540696f8771a0d5b1df478 --- .../ATen/native/cuda/BatchLinearAlgebra.cu | 36 ++++++++++++++- test/test_torch.py | 45 +++++++++++++++---- 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 5394c2a23239..c86f355a67c2 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -344,6 +344,24 @@ void magmaCholesky( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, double** dA_array, magma_int_t ldda, @@ -360,6 +378,22 @@ void magmaCholeskyBatched( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_zpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_cpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaTriangularSolve( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, @@ -904,7 +938,7 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) { self_working_copy = cloneBatchedColumnMajor(self); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ apply_cholesky(self_working_copy, false, infos); }); if (self.dim() > 2) { diff --git a/test/test_torch.py b/test/test_torch.py index 6b529712ab5c..12a72c2f11c8 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -7739,14 +7739,29 @@ def cholesky_test_helper(n, batchsize, device, upper): for upper, batchsize in product([True, False], [262144, 524288]): cholesky_test_helper(2, batchsize, device, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky_batched(self, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_pd_matrix + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) def cholesky_test_helper(n, batch_dims, upper): - A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, 
device=device) + # This is a workaround while there is no support for complex random_symmetric_pd_matrix + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + # There is no support for complex batched matmul yet + matmul_list = [] + for mat in A.contiguous().view(-1, n, n): + matmul_list.append(mat @ mat.t().conj()) + A = torch.stack(matmul_list).view(*batch_dims, n, n) + else: + A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) cholesky_exp = torch.stack([m.cholesky(upper=upper) for m in A.reshape(-1, n, n)]) cholesky_exp = cholesky_exp.reshape_as(A) self.assertEqual(cholesky_exp, torch.cholesky(A, upper=upper)) @@ -7754,26 +7769,38 @@ def cholesky_test_helper(n, batch_dims, upper): for upper, batchsize in product([True, False], [(3,), (3, 4), (2, 3, 4)]): cholesky_test_helper(3, batchsize, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky(self, device, dtype): - x = torch.rand(10, 10, dtype=dtype, device=device) + 1e-1 - A = torch.mm(x, x.t()) + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) + + # This is a workaround while there is no support for complex random_symmetric_pd_matrix + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + A = A @ A.t().conj() + else: + A = random_symmetric_pd_matrix(10, dtype=dtype, device=device) # default Case C = torch.cholesky(A) - B = torch.mm(C, C.t()) + B = torch.mm(C, C.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0) # test Upper Triangular U = torch.cholesky(A, True) - B = torch.mm(U.t(), U) + B = torch.mm(U.t().conj(), U) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (upper) did not allow rebuilding the original matrix') # test Lower Triangular L = torch.cholesky(A, False) - B = torch.mm(L, L.t()) + B = torch.mm(L, L.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (lower) did not allow rebuilding the original matrix') def test_view(self, device): From 9e30a76697ddedb46887a92559b317840bce6804 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 23 Sep 2020 09:47:05 -0700 Subject: [PATCH 052/449] Filter `strtod_l` is undeclared errors from sccache log (#45183) Summary: This prevents DrCI from misidentifying test failures for the compilation failures, such as: ``` /var/lib/jenkins/workspace/build/CMakeFiles/CMakeTmp/CheckSymbolExists.c:8:19: error: use of undeclared identifier \'strtod_l\' return ((int*)(&strtod_l))[argc]; ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/45183 Reviewed By: ezyang Differential Revision: D23859267 Pulled By: malfet fbshipit-source-id: 283d9bd2ab712f23239b72f3758d121e2d026fb0 --- .jenkins/pytorch/print_sccache_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.jenkins/pytorch/print_sccache_log.py 
b/.jenkins/pytorch/print_sccache_log.py index c91472876c33..81c7e0752328 100644 --- a/.jenkins/pytorch/print_sccache_log.py +++ b/.jenkins/pytorch/print_sccache_log.py @@ -6,6 +6,7 @@ lines = f.readlines() for line in lines: - # Ignore errors from CPU instruction set testing - if 'src.c' not in line: + # Ignore errors from CPU instruction set or symbol existing testing + keywords = ['src.c', 'CheckSymbolExists.c'] + if all([keyword not in line for keyword in keywords]): print(line) From 9db38712889f049ed97d252db504dc598f696522 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 23 Sep 2020 10:48:42 -0700 Subject: [PATCH 053/449] Update true_divide_out to use at::. (#45079) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45079 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D23821701 Pulled By: ailzhang fbshipit-source-id: 562eac10faba7a503eda0029a0b026c1fb85fe1e --- aten/src/ATen/native/BinaryOps.cpp | 2 +- test/test_torch.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index fc55379578ff..cab77c25b885 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -175,7 +175,7 @@ Tensor& divide_(Tensor& self, Scalar other) { // true_divide, an alias for div Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) { - return native::div_out(result, self, divisor); + return at::div_out(result, self, divisor); } Tensor true_divide(const Tensor& self, const Tensor& divisor) { diff --git a/test/test_torch.py b/test/test_torch.py index 12a72c2f11c8..4b08697a908c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -12681,7 +12681,7 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}") - @skipCUDAIfRocm + @skipCUDAIfRocm @onlyOnCPUAndCUDA @dtypesIfCUDA(*(torch.testing.get_all_complex_dtypes() + torch.testing.get_all_int_dtypes())) @@ -16857,6 +16857,15 @@ def test_div(self, device, dtype): atol=0.01, rtol=0) self.assertEqual(method(a1, a2), op(a1, a2)) + @dtypes(torch.bfloat16, torch.float) + def test_true_divide_out(self, device, dtype): + a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) + a2 = torch.tensor([2., 2.], dtype=dtype, device=device) + res = torch.empty_like(a1) + self.assertEqual(torch.true_divide(a1, a2, out=res), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, rtol=0) + @onlyCUDA @dtypes(torch.half) def test_divmul_scalar(self, device, dtype): From a5a4924c2793b64bf68491c136d83f319c133a2d Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 23 Sep 2020 10:52:54 -0700 Subject: [PATCH 054/449] Warn if `import torch` is called from the source root. (#39995) Summary: This is a small developer quality of life improvement. I commonly try to run some snippet of python as I'm working on a PR and forget that I've cd-d into the local clone to run some git commands, resulting in annoying failures like: `ImportError: cannot import name 'default_generator' from 'torch._C' (unknown location)` This actually took a non-trivial amount of time to figure out the first time I hit it, and even now it's annoying because it happens just infrequently enough to not sit high in the mental cache. 
This PR adds a check to `torch/__init__.py` and warns if `import torch` is likely resolving to the wrong thing: ``` WARNING:root:You appear to be importing PyTorch from a clone of the git repo: /data/users/taylorrobie/repos/pytorch This will prevent `import torch` from resolving to the PyTorch install (instead it will try to load /data/users/taylorrobie/repos/pytorch/torch/__init__.py) and will generally lead to other failures such as a failure to load C extensions. ``` so that the soon to follow internal import failure makes some sense. I elected to make this a warning rather than an exception because I'm not 100% sure that it's **always** wrong. (e.g. weird `PYTHONPATH` or `importlib` corner cases.) EDIT: There are now separate cases for `cwd` vs. `PYTHONPATH`, and failure is an `ImportError`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/39995 Reviewed By: malfet Differential Revision: D23817209 Pulled By: robieta fbshipit-source-id: d9ac567acb22d9c8c567a8565a7af65ac624dbf7 --- torch/__init__.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/torch/__init__.py b/torch/__init__.py index 6523ab126c0d..da9eecad7df5 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -12,6 +12,7 @@ import os import sys import platform +import textwrap import ctypes if sys.version_info < (3,): @@ -193,6 +194,31 @@ def _load_global_deps(): if TYPE_CHECKING: import torch._C as _C +# Check to see if we can load C extensions, and if not provide some guidance +# on what the problem might be. +try: + # _initExtension is chosen (arbitrarily) as a sentinel. + from torch._C import _initExtension +except ImportError: + import torch._C as _C_for_compiled_check + + # The __file__ check only works for Python 3.7 and above. + if sys.version_info >= (3, 7) and _C_for_compiled_check.__file__ is None: + raise ImportError(textwrap.dedent(''' + Failed to load PyTorch C extensions: + It appears that PyTorch has loaded the `torch/_C` folder + of the PyTorch repository rather than the C extensions which + are expected in the `torch._C` namespace. This can occur when + using the `install` workflow. e.g. + $ python setup.py install && python -c "import torch" + + This error can generally be solved using the `develop` workflow + $ python setup.py develop && python -c "import torch" # This should succeed + or by running Python from a different directory. + ''').strip()) from None + raise # If __file__ is not None the cause is unknown, so just re-raise. + + __all__ += [name for name in dir(_C) if name[0] != '_' and not name.endswith('Base')] @@ -477,9 +503,9 @@ def manager_path(): # is not a good way to fix this problem. Perhaps, try to redesign VariableFunctions # so that this import is good enough if TYPE_CHECKING: - # Some type signatures pulled in from _VariableFunctions here clash with + # Some type signatures pulled in from _VariableFunctions here clash with # signatures already imported. For now these clashes are ignored; see - # PR #43339 for details. + # PR #43339 for details. from torch._C._VariableFunctions import * # type: ignore for name in dir(_C._VariableFunctions): From da4033d32adafd6fe0fa3a3727fcb2b55c19e2e9 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 23 Sep 2020 11:03:53 -0700 Subject: [PATCH 055/449] Make cudaHostRegister actually useful on cudart. (#45159) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45159 By default, pybind11 binds void* to be capsules. 
After a lot of Googling, I have concluded that this is not actually useful: you can't actually create a capsule from Python land, and our data_ptr() function returns an int, which means that the function is effectively unusable. It didn't help that we had no tests exercising it. I've replaced the void* with uintptr_t, so that we now accept int (and you can pass data_ptr() in directly). I'm not sure if we should make these functions accept ctypes types; unfortunately, pybind11 doesn't seem to have any easy way to do this. Fixes #43006 Also added cudaHostUnregister which was requested. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: lw Differential Revision: D23849731 Pulled By: ezyang fbshipit-source-id: 8a79986f3aa9546abbd2a6a5828329ae90fd298f --- test/test_cuda.py | 12 ++++++++++++ torch/csrc/cuda/shared/cudart.cpp | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 011e8c374645..2d23954cfcf8 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -279,6 +279,18 @@ def assert_change(comp=1, empty_cache=False, reset_peak=False): assert_change(0, empty_cache=True) assert_change(0, reset_peak=True) + @skipIfRocm + def test_cudart_register(self): + t = torch.ones(20) + self.assertFalse(t.is_pinned()) + cudart = torch.cuda.cudart() + r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0) + self.assertEquals(r, 0) + self.assertTrue(t.is_pinned()) + r = cudart.cudaHostUnregister(t.data_ptr()) + self.assertEquals(r, 0) + self.assertFalse(t.is_pinned()) + def test_memory_stats(self): gc.collect() torch.cuda.empty_cache() diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index efada16a49c8..a8f80a35855d 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -29,7 +29,12 @@ void initCudartBindings(PyObject* module) { cudart.def("cuda" "GetErrorString", cudaGetErrorString); cudart.def("cuda" "ProfilerStart", cudaProfilerStart); cudart.def("cuda" "ProfilerStop", cudaProfilerStop); - cudart.def("cuda" "HostRegister", cudaHostRegister); + cudart.def("cuda" "HostRegister", [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t { + return cudaHostRegister((void*)ptr, size, flags); + }); + cudart.def("cuda" "HostUnregister", [](uintptr_t ptr) -> cudaError_t { + return cudaHostUnregister((void*)ptr); + }); #ifndef __HIP_PLATFORM_HCC__ cudart.def("cuda" "ProfilerInitialize", cudaProfilerInitialize); #endif From 4d80c8c64885eb383d3241bd3ff3d272e5be4cd1 Mon Sep 17 00:00:00 2001 From: Zino Benaissa Date: Wed, 23 Sep 2020 11:12:56 -0700 Subject: [PATCH 056/449] Fix inlining interface call in fork subgraph (#43790) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43790 Interface calls were not handled properly when they are used in fork subgraph. This PR fixes this issue. 
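To make the scenario concrete, here is a condensed sketch of one of the patterns the new tests below exercise (a module method called through `fork`); names are illustrative and freezing goes through the internal `torch._C._freeze_module` entry point, as in the tests:

```python
import torch
import torch.nn as nn

class ForkingModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.ones(2, 2)

    @torch.jit.export
    def scale(self, x):
        return x * self.w

    def forward(self, x):
        # The forked call lands in a separate subgraph; freezing must now
        # resolve and inline the module/interface call inside it as well.
        fut = torch.jit._fork(self.scale, x)
        y = self.scale(x)
        return y + torch.jit._wait(fut)

m = torch.jit.script(ForkingModule())
m.eval()
frozen = torch._C._freeze_module(m._c)
out = frozen.forward(torch.randn(2, 2))
```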
Test Plan: Imported from OSS Reviewed By: eellison Differential Revision: D23402039 Pulled By: bzinodev fbshipit-source-id: 41adc5ee7d942250e732e243ab30e356d78d9bf7 --- test/jit/test_freezing.py | 52 +++++++++++++++++++++++-- test/jit/test_module_interface.py | 52 +++++++++++++++++++++++++ torch/csrc/jit/passes/freeze_module.cpp | 37 ++++++++++-------- 3 files changed, 121 insertions(+), 20 deletions(-) diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 2d2c404051f6..4ec8f7e46d1b 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -237,8 +237,8 @@ def forward(self, x): def test_freeze_module_with_fork2(self): @torch.jit.script - def foo(x, y): - return x * y + def foo(x): + return x * 2 class TestModule(nn.Module): def __init__(self): @@ -247,8 +247,8 @@ def __init__(self): self.b = torch.ones(20, 20) def forward(self, x): - fut = torch.jit._fork(foo, self.a, self.b) - y_hat = foo(self.a, self.b) + fut = torch.jit._fork(foo, self.a) + y_hat = foo(self.b) y = torch.jit._wait(fut) return y_hat + y @@ -272,6 +272,50 @@ def forward(self, x): # conservatively assumes there is a mutation because attributes are # passed to fork subgraph. both 'a' and 'b' are preserved. self.assertTrue(mf.hasattr('a')) + self.assertFalse(mf.hasattr('b')) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_fork_calling_module_method(self): + @torch.jit.script + def foo(x, y): + return x * y + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.a = torch.ones(20, 20) + self.b = torch.ones(20, 20) + + @torch.jit.export + def foo(self, x): + return x * self.a + + @torch.jit.export + def bar(self, x): + return x * self.b + + def forward(self, x): + fut = torch.jit._fork(self.foo, self.b) + y_hat = self.bar(self.a) + y = torch.jit._wait(fut) + return y_hat + y + + m = torch.jit.script(TestModule()) + m.eval() + input = torch.randn(2, 2) + output_s = m.forward(input) + mf = torch._C._freeze_module(m._c) + # Check if frozen module looks as below: + # module m { + # attributes { + # self.b = .. + # } + # ... + # TODO: Although there are no mutation, the alias analysis + # conservatively assumes there is a mutation because attributes are + # passed to fork subgraph. 'b' is preserved. 
+ self.assertFalse(mf.hasattr('a')) self.assertTrue(mf.hasattr('b')) output_f = mf.forward(input) self.assertEqual(output_s, output_f) diff --git a/test/jit/test_module_interface.py b/test/jit/test_module_interface.py index 963c1ede8323..f06dafbc1ba2 100644 --- a/test/jit/test_module_interface.py +++ b/test/jit/test_module_interface.py @@ -595,6 +595,58 @@ def forward(self, x): with self.assertRaisesRegex(RuntimeError, "failed to freeze interface attribute 'proxy_mod'"): mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_freeze_module_with_interface_and_fork(self): + class SubModule(torch.nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.b = torch.tensor([1.5]) + + def forward(self, x): + self.b[0] += 3.2 + return self.b + + class OrigMod(torch.nn.Module): + def __init__(self): + super(OrigMod, self).__init__() + self.a = torch.tensor([0.5]) + + def forward(self, x): + return self.a + + @torch.jit.interface + class ModInterface(torch.nn.Module): + def forward(self, x): + # type: (Tensor) -> Tensor + pass + + class TestModule(torch.nn.Module): + proxy_mod : ModInterface + + def __init__(self): + super(TestModule, self).__init__() + self.proxy_mod = OrigMod() + self.sub = SubModule() + + def forward(self, x): + y = self.proxy_mod(x); + z= self.sub(x) + return y + z + + class MainModule(torch.nn.Module): + def __init__(self): + super(MainModule, self).__init__() + self.test= TestModule(); + + def forward(self, x): + fut = torch.jit._fork(self.test.forward, x) + y = self.test(x) + z = torch.jit._wait(fut) + return y + z + + m = torch.jit.script(MainModule()) + m.eval() + mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_module_apis_interface(self): @torch.jit.interface class ModuleInterface(nn.Module): diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 4e95c9af40e3..bec7bf144201 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -97,12 +97,7 @@ class AttributePropagator { auto graph = function->graph(); optimizeSubGraphs(graph, applyInline); if (freezeInterfaces_) { - optimizeSubGraphs( - graph, - std::bind( - &AttributePropagator::inlineInterfaceCalls, - *this, - std::placeholders::_1)); + inlineInterfaceCalls(graph); } // Record Attributes that are explicitly set in the module. // They cannot be folded. @@ -379,6 +374,14 @@ class AttributePropagator { inlineInterfaceCall(n, attr); // Reset the GetAttr to concrete module type. n->output()->setType(attr.type()); + } else if (n->kind() == prim::fork) { + applyToForkSubgraph( + n, + graph, + std::bind( + &AttributePropagator::inlineInterfaceCalls, + *this, + std::placeholders::_1)); } } } @@ -476,18 +479,20 @@ class AttributePropagator { auto node = n->inputs()[0]->node(); // Check if first parameter of fork is a module. This module is used // as the base module (similar to 'self' in forward) to resolve GetAttrs. - if (node->kind() != prim::GetAttr) { - return; - } - auto name = node->s(attr::name); - auto input = node->inputs()[0]; - if (!findConstantAttr(input, name, attrModule, graph)) { - // Module needs to be preserved. - return; + // Otherwise freezing is applied using module_ + if (node->kind() == prim::GetAttr && + node->output()->type()->cast()) { + auto name = node->s(attr::name); + auto input = node->inputs()[0]; + if (!findConstantAttr(input, name, attrModule, graph)) { + // Module needs to be preserved. 
+        return;
+      }
+      attrModule = attrModule.attr(name).toModule();
+      std::swap(module_, attrModule);
     }
-    attrModule = attrModule.attr(name).toModule();
+
     auto subgraph = n->g(attr::Subgraph);
-    std::swap(module_, attrModule);
     func(subgraph);
     module_ = attrModule;
   }

From 99242eca1db7ac50ae809f3dd57e3d5ae2b88284 Mon Sep 17 00:00:00 2001
From: Tim Nieradzik
Date: Wed, 23 Sep 2020 11:36:33 -0700
Subject: [PATCH 057/449] Dockerfile: Support CUDA 11 (#45071)

Summary:
Although PyTorch already supports CUDA 11, the Dockerfile still relies on CUDA 10. This pull request upgrades all the necessary versions such that recent NVIDIA GPUs like A100 can be used.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/45071

Reviewed By: ezyang

Differential Revision: D23873224

Pulled By: seemethere

fbshipit-source-id: 822c25f183dcc3b4c5b780c00cd37744d34c6e00
---
 Dockerfile      | 4 ++--
 docker.Makefile | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index d5619e1a8011..5bae3ec14ea6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -44,13 +44,13 @@ WORKDIR /opt/pytorch
 COPY --from=conda /opt/conda /opt/conda
 COPY --from=submodule-update /opt/pytorch /opt/pytorch
 RUN --mount=type=cache,target=/opt/ccache \
-    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
     python setup.py install

 FROM conda as conda-installs
 ARG INSTALL_CHANNEL=pytorch-nightly
-RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=10.1 && \
+RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=11.0.221 && \
     /opt/conda/bin/conda clean -ya

 FROM ${BASE_IMAGE} as official
diff --git a/docker.Makefile b/docker.Makefile
index ba53b94d7898..18acced1de8d 100644
--- a/docker.Makefile
+++ b/docker.Makefile
@@ -9,7 +9,7 @@ DOCKER_ORG = $(shell whoami)
 endif

 BASE_RUNTIME = ubuntu:18.04
-BASE_DEVEL = nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
+BASE_DEVEL = nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04

 # The conda channel to use to install pytorch / torchvision
 INSTALL_CHANNEL = pytorch

From 21fabae47a44a95b7840266a31c89dab1731ef6c Mon Sep 17 00:00:00 2001
From: Bradley Davis
Date: Wed, 23 Sep 2020 13:50:26 -0700
Subject: [PATCH 058/449] Remove expensive call to PyObject_GetAttrString in PyTorch_LookupSpecial (#44684)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44684

The ad-hoc quantization benchmarking script in D23689062 recently highlighted that quantized ops were surprisingly slow after the introduction of support for custom ops in torch.fx in D23203204 (https://github.com/pytorch/pytorch/commit/f15e27265ff76f49844b0ccc6ca387cb564824bf).

Using strobelight, it's immediately clear that up to 66% of samples were seen in `c10::get_backtrace`, which descends from `torch::is_tensor_and_apppend_overloaded -> torch::check_has_torch_function -> torch::PyTorch_LookupSpecial -> PyObject_HasAttrString -> PyObject_GetAttrString`.
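
To make the problem concrete, here is a minimal standalone sketch of the two lookup patterns, using only the plain CPython C API. The `lookup_special_double` / `lookup_special_single` names are invented for illustration; this is not the code touched by the patch, which simply drops the `PyObject_HasAttrString` pre-check and relies on torch's `PyObject_FastGetAttrString` helper alone:

```
#include <Python.h>

// Redundant pattern: PyObject_HasAttrString() already performs a full
// attribute lookup (and swallows the AttributeError) just to produce a bool,
// and then the attribute is looked up a second time.
static PyObject* lookup_special_double(PyObject* obj, const char* name) {
  PyObject* tp = reinterpret_cast<PyObject*>(Py_TYPE(obj));
  if (PyObject_HasAttrString(tp, name) == 0) {
    return nullptr;  // not found
  }
  return PyObject_GetAttrString(tp, name);  // second lookup, new reference
}

// Single-lookup pattern: look the attribute up once and treat "missing" as a
// normal result instead of letting the error propagate.
static PyObject* lookup_special_single(PyObject* obj, const char* name) {
  PyObject* tp = reinterpret_cast<PyObject*>(Py_TYPE(obj));
  PyObject* attr = PyObject_GetAttrString(tp, name);  // new reference or NULL
  if (attr == nullptr) {
    PyErr_Clear();  // a missing attribute is expected here, not an error
  }
  return attr;
}
```

Either way the caller only checks the result for NULL; the difference is that the second version goes through the attribute machinery once per call instead of twice.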
I'm no expert by any means so please correct any/all misinterpretation, but it appears that: - `check_has_torch_function` only needs to return a bool - `PyTorch_LookupSpecial` should return `NULL` if a matching method is not found on the object - in the impl of `PyTorch_LookupSpecial` the return value from `PyObject_HasAttrString` only serves as a bool to return early, but ultimately ends up invoking `PyObject_GetAttrString`, which raises, spawning the generation of a backtrace - `PyObject_FastGetAttrString` returns `NULL` (stolen ref to an empty py::object if the if/else if isn't hit) if the method is not found, anyway, so it could be used singularly instead of invoking both `GetAttrString` and `FastGetAttrString` - D23203204 (https://github.com/pytorch/pytorch/commit/f15e27265ff76f49844b0ccc6ca387cb564824bf) compounded (but maybe not directly caused) the problem by increasing the number of invocations so, removing it in this diff and seeing how many things break :) before: strobelight: see internal section output from D23689062 script: ``` $ ./buck-out/gen/scripts/v/test_pt_quant_perf.par Sequential( (0): Quantize(scale=tensor([0.0241]), zero_point=tensor([60]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.017489388585090637, zero_point=68, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.010896682739257812 q 0.11908197402954102 ``` after: strobelight: see internal section output from D23689062 script: ``` $ ./buck-out/gen/scripts/v/test_pt_quant_perf.par Sequential( (0): Quantize(scale=tensor([0.0247]), zero_point=tensor([46]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.012683945707976818, zero_point=41, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.011141300201416016 q 0.022639036178588867 ``` which roughly restores original performance seen in P142370729 UPDATE: 9/22 mode/opt benchmarks ``` buck run //scripts/x:test_pt_quant_perf mode/opt Sequential( (0): Quantize(scale=tensor([0.0263]), zero_point=tensor([82]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.021224206313490868, zero_point=50, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.002968311309814453 q 0.5138928890228271 ``` with patch: ``` buck run //scripts/x:test_pt_quant_perf mode/opt Sequential( (0): Quantize(scale=tensor([0.0323]), zero_point=tensor([70]), dtype=torch.quint8) (1): QuantizedLinear(in_features=4, out_features=4, scale=0.017184294760227203, zero_point=61, qscheme=torch.per_tensor_affine) (2): DeQuantize() ) fp 0.0026655197143554688 q 0.0064449310302734375 ``` Reviewed By: ezyang Differential Revision: D23697334 fbshipit-source-id: f756d744688615e01c94bf5c48c425747458fb33 --- torch/csrc/utils/python_arg_parser.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index a641fbda2013..78efb6cf2db3 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -712,9 +712,6 @@ static py::object PyTorch_LookupSpecial(PyObject *obj, char* name) if (_is_basic_python_type(tp)) { return py::object(); } - if(PyObject_HasAttrString(obj, name) == 0){ - return py::object(); - } return PyObject_FastGetAttrString((PyObject *)tp, name); } From adb2b380baf1d78a5e4a48d8a6999b94aaeff403 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Sep 2020 13:53:52 -0700 Subject: [PATCH 059/449] [quant][graphmode][fx] qconfig_dict support more types of configurations (#44856) Summary: Pull 
Request resolved: https://github.com/pytorch/pytorch/pull/44856 Support following format of qconfig_dict ```python qconfig_dict = { # optional, global config "": qconfig?, # optional, used for module and function types # could also be split into module_types and function_types if we prefer "object_type": [ (nn.Conv2d, qconfig?), (F.add, qconfig?), ..., ], # optional, used for module names "module_name": [ ("foo.bar", qconfig?) ..., ], # optional, matched in order, first match takes precedence "module_name_regex": [ ("foo.*bar.*conv[0-9]+", qconfig?) ..., ] # priority (in increasing order): global, object_type, module_name_regex, module_name # qconfig == None means fusion and quantization should be skipped for anything # matching the rule } ``` Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D23751304 fbshipit-source-id: 5b98f4f823502b12ae2150c93019c7b229c49c50 --- test/quantization/test_quantize_fx.py | 126 +++++++++++++++++++++++++- torch/quantization/fx/quantize.py | 120 ++++++++++++++++++++++-- torch/quantization/quantize_fx.py | 38 ++++++-- 3 files changed, 263 insertions(+), 21 deletions(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index fc4a735854ef..c1641ae3e194 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -61,6 +61,7 @@ import operator import unittest +@skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def _get_conv_linear_test_cases(self): ''' Returns a list of test cases, with format: @@ -334,7 +335,8 @@ def forward(self, x): m = M().eval() m = symbolic_trace(m) - qconfig_dict = {'': default_qconfig, 'conv2': None} + qconfig_dict = {"": default_qconfig, + "module_name": [("conv2", None)]} m = prepare_static_fx(m, qconfig_dict) data = torch.randn(1, 1, 1, 1) m(data) @@ -344,11 +346,131 @@ def forward(self, x): node_list = [ ns.call_function(torch.quantize_per_tensor), ns.call_module(nnq.Conv2d), - ns.call_method('dequantize'), + ns.call_method("dequantize"), ns.call_module(nn.Conv2d), ] self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_qconfig_module_type(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(torch.nn.Conv2d, default_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_static_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_function(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x, y): + return x + y + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(operator.add, default_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data, data) + m = convert_static_fx(m) + m(data, data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_function(torch.ops.quantized.add), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, 
expected_node_list=node_list) + + def test_qconfig_module_name_regex(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"module_name_regex": [("conv*", default_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_static_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_precedence(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(1, 1) + self.conv = nn.Conv2d(1, 1, 1) + self.module_conv1 = nn.Conv2d(1, 1, 1) + self.module_conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + # global + x = self.linear(x) + # global + object_type --> object_type + x = self.conv(x) + # global + object_type + module_name_regex --> module_name_regex + x = self.module_conv1(x) + # global + object_type + module_name_regex + module_name --> module_name + x = self.module_conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + global_qconfig = default_qconfig + object_type_qconfig = default_dynamic_qconfig + module_name_regex_qconfig = float16_dynamic_qconfig + module_name_qconfig = default_qat_qconfig + qconfig_dict = { + "": global_qconfig, + "object_type": [(nn.Conv2d, object_type_qconfig)], + "module_name_regex": [("module_conv*", module_name_regex_qconfig)], + "module_name": [("module_conv2", module_name_qconfig)]} + m = prepare_static_fx(m, qconfig_dict) + self.assertEqual(m.linear.qconfig, global_qconfig) + self.assertEqual(m.conv.qconfig, object_type_qconfig) + self.assertEqual(m.module_conv1.qconfig, module_name_regex_qconfig) + self.assertEqual(m.module_conv2.qconfig, module_name_qconfig) + + def test_remove_qconfig(self): class M(torch.nn.Module): def __init__(self): diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 6254120999f0..67e538b40433 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -40,7 +40,9 @@ quantize_node, ) +from collections import OrderedDict import copy +import re # ------------------------ # Helper Functions @@ -136,6 +138,54 @@ def is_submodule_of_fake_quant(name, module, named_modules): parent_name, _ = _parent_name(name) return is_activation_post_process(named_modules[parent_name]) +def get_flattened_qconfig_dict(qconfig_dict): + """ flatten the global, object_type and module_name qconfig + to the same qconfig_dict so that it can be used by + propagate_qconfig_ function. + "module_name_regex" is ignored for now since it's not supported + in propagate_qconfig_, but it can be fixed later. 
+ + For example: + Input: { + "": qconfig, + "object_type": [ + (torch.add, qconfig) + ], + "module_name": [ + ("conv", qconfig) + ] + } + + Output: { + "": qconfig, + torch.add: qconfig, + "conv": qconfig + } + """ + flattened = dict() + if '' in qconfig_dict: + flattened[''] = qconfig_dict[''] + + def flatten_key(key): + if key in qconfig_dict: + for obj, qconfig in qconfig_dict[key]: + flattened[obj] = qconfig + + flatten_key('object_type') + flatten_key('module_name') + return flattened + +def convert_dict_to_ordered_dict(qconfig_dict): + """ Convert dict in qconfig_dict to ordered dict + """ + # convert a qconfig list for a type to OrderedDict + def _convert_to_ordered_dict(key, qconfig_dict): + qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) + + _convert_to_ordered_dict('object_type', qconfig_dict) + _convert_to_ordered_dict('module_name_regex', qconfig_dict) + _convert_to_ordered_dict('module_name', qconfig_dict) + # A dictionary for querying the weight index for a given op WEIGHT_INDEX_DICT = { torch.nn.functional.conv2d : [1], @@ -181,23 +231,72 @@ def __init__(self): def _qat_swap_modules(self, root): convert(root, mapping=get_qat_module_mappings(), inplace=True, remove_qconfig=False) - def _generate_qconfig_map(self, root, input_graph): - def get_qconfig(module): - return module.qconfig if hasattr(module, 'qconfig') else None + def _generate_qconfig_map(self, + root, + input_graph, + qconfig_dict): + global_qconfig = qconfig_dict.get('', None) + + def get_module_type_qconfig( + module_type, fallback_qconfig=global_qconfig): + return qconfig_dict['object_type'].get(module_type, fallback_qconfig) + + def get_function_qconfig( + function, fallback_qconfig=global_qconfig): + return qconfig_dict['object_type'].get(function, fallback_qconfig) + + def get_module_name_regex_qconfig( + module_name, fallback_qconfig=global_qconfig): + for regex_pattern, qconfig in qconfig_dict['module_name_regex'].items(): + if re.match(regex_pattern, module_name): + # first match wins + return qconfig + return fallback_qconfig + + def get_module_name_qconfig( + module_name, fallback_qconfig=global_qconfig): + if module_name == '': + # module name qconfig not found + return fallback_qconfig + if module_name in qconfig_dict['module_name']: + return qconfig_dict['module_name'][module_name] + else: + parent, _ = _parent_name(module_name) + return get_module_name_qconfig(parent, fallback_qconfig) + + # get qconfig for module_name, + # fallback to module_name_regex_qconfig, module_type_qconfig, global_qconfig + # if necessary + def get_qconfig(module_name): + module_type_qconfig = \ + get_module_type_qconfig(type(self.modules[module_name])) + module_name_regex_qconfig = \ + get_module_name_regex_qconfig(module_name, module_type_qconfig) + module_name_qconfig = \ + get_module_name_qconfig(module_name, module_name_regex_qconfig) + return module_name_qconfig self.qconfig_map = dict() for node in input_graph.nodes: if node.op == 'get_attr': - parent, _ = _parent_name(node.target) - self.qconfig_map[node.name] = get_qconfig(self.modules[parent]) + module_name, _ = _parent_name(node.target) + self.qconfig_map[node.name] = get_qconfig(module_name) elif node.op == 'call_function': - self.qconfig_map[node.name] = get_qconfig(root) + # precedence: [TODO] module_name_qconfig (need scope support from fx) + # > function_qconfig > global_qconfig + function_qconfig = get_function_qconfig(node.target) + self.qconfig_map[node.name] = function_qconfig elif node.op == 'call_method': self_obj = node.args[0] # qconfig 
for call_method should be the same as the `self` object for the call self.qconfig_map[node.name] = self.qconfig_map[self_obj.name] elif node.op == 'call_module': - self.qconfig_map[node.name] = get_qconfig(self.modules[node.target]) + module_qconfig = get_qconfig(node.target) + # regex is not supported eager mode propagate_qconfig_, we'll need to + # set the qconfig explicitly here in case regex + # is used + self.modules[node.target].qconfig = module_qconfig + self.qconfig_map[node.name] = module_qconfig def _prepare(self, model, qconfig_dict, inplace, is_dynamic_quant): if not inplace: @@ -208,14 +307,17 @@ def _prepare(self, model, qconfig_dict, inplace, is_dynamic_quant): else: self.patterns = get_quant_patterns() - propagate_qconfig_(model, qconfig_dict) + flattened_qconfig_dict = get_flattened_qconfig_dict(qconfig_dict) + # TODO: support regex as well + propagate_qconfig_(model, flattened_qconfig_dict) if model.training: self._qat_swap_modules(model) self.modules = dict(model.named_modules()) + convert_dict_to_ordered_dict(qconfig_dict) # map from node name to qconfig, used in _find_matches - self._generate_qconfig_map(model, model.graph) + self._generate_qconfig_map(model, model.graph, qconfig_dict) # match the patterns that will get quantized matches = self._find_matches(model.graph, self.modules, self.patterns) diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index 77178552ee71..0f68f2e0e9e9 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -115,16 +115,34 @@ def quantize_static_fx(model, qconfig_dict, run_fn, run_args, inplace=False, deb Args: `model`: input float TorchScript model - `qconfig_dict`: qconfig_dict is a dictionary with names of sub modules as key and - qconfig for that module as value, empty key means the qconfig will be applied - to whole model unless it’s overwritten by more specific configurations, the - qconfig for each module is either found in the dictionary or fallback to - the qconfig of parent module. - - Right now qconfig_dict is the only way to configure how the model is quantized, - and it is done in the granularity of module, that is, we only support one type - of qconfig for each torch.nn.Module, and the qconfig for sub module will - override the qconfig for parent module, empty string means global configuration. + `qconfig_dict`: qconfig_dict is a dictionary with the following configurations: + qconfig_dict = { + # optional, global config + "": qconfig?, + + # optional, used for module and function types + # could also be split into module_types and function_types if we prefer + "object_type": [ + (torch.nn.Conv2d, qconfig?), + (torch.nn.functional.add, qconfig?), + ..., + ], + + # optional, used for module names + "module_name": [ + ("foo.bar", qconfig?) + ..., + ], + + # optional, matched in order, first match takes precedence + "module_name_regex": [ + ("foo.*bar.*conv[0-9]+", qconfig?) 
+            ...,
+          ]
+          # priority (in increasing order): global, object_type, module_name_regex, module_name
+          # qconfig == None means fusion and quantization should be skipped for anything
+          # matching the rule
+        }
     `run_fn`: a calibration function for calibrating the prepared model
     `run_args`: positional arguments for `run_fn`
     `inplace`: carry out model transformations in-place, the original module is

From 9e206ee9f1b287d95970abc4a1bcd1756527d012 Mon Sep 17 00:00:00 2001
From: Nick Gibson
Date: Wed, 23 Sep 2020 13:55:01 -0700
Subject: [PATCH 060/449] [NNC] Fix a bug in SplitWithMask when splitting multiple times (#45141)

Summary:
When doing a splitWithMask we only mask if the loop extent is not cleanly divided by the split factor. However, the logic does not simplify the extents, so any nontrivial loop extent will always cause a mask to be added, e.g. if the loop had been previously split. Unlike splitWithTail, the masks added by splitWithMask are always overhead and we don't have the analysis to optimize them out if they are unnecessary, so it's good to avoid inserting them if we can. The fix is just to simplify the loop extents before doing the extent calculation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/45141

Reviewed By: ezyang

Differential Revision: D23869170

Pulled By: nickgg

fbshipit-source-id: 44686fd7b802965ca4f5097b0172a41cf837a1f5
---
 test/cpp/tensorexpr/test_loopnest.cpp  | 34 ++++++++++++++++++++++++++
 test/cpp/tensorexpr/tests.h            |  1 +
 torch/csrc/jit/tensorexpr/loopnest.cpp |  9 ++++---
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp
index 602eb116e7b9..201a7e57820b 100644
--- a/test/cpp/tensorexpr/test_loopnest.cpp
+++ b/test/cpp/tensorexpr/test_loopnest.cpp
@@ -609,6 +609,40 @@ void testExprSplitWithMask01() {
   ExpectAllNear(c_v, c_ref, 1e-5);
 }

+// Tests the case where we split a loop cleanly multiple times, we should not
+// insert any masks.
+void testExprSplitWithMaskRepeatedNoMask() {
+  KernelScope kernel_scope;
+  const int M = 64;
+  Buffer a_buf("a", kFloat, {M});
+  Buffer b_buf("b", kFloat, {M});
+  Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) {
+    return a_buf(m) + b_buf(m) + 1.0f;
+  });
+
+  LoopNest l({tensor});
+  std::vector<For*> loops = l.getLoopStmtsFor(tensor);
+  For *outer, *mid, *inner;
+  l.splitWithMask(loops[0], 4, &outer, &inner);
+  l.splitWithMask(outer, 4, &outer, &mid);
+
+  Stmt* stmt1 = IRSimplifier::simplify(l.root_stmt());
+  std::ostringstream oss;
+  oss << *stmt1;
+
+  // Two splits mean 3 loops, but should need no masks in this case.
+ const std::string& verification_pattern = + R"IR( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: f[)IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); +} + void testSplitWithTailWithLoopOptions() { KernelScope kernel_scope; const int M = 21; diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 56831c8db663..d0a9aa840b91 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -55,6 +55,7 @@ namespace jit { _(ExprSplitWithTail) \ _(ExprSplitWithTailNone) \ _(ExprSplitWithMask01) \ + _(ExprSplitWithMaskRepeatedNoMask) \ _(SplitWithTailWithLoopOptions) \ _(SplitWithMaskWithLoopOptions) \ _(ScheduleBroadcastAddBuffer) \ diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index f80e4585b790..b7862fb953c1 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -1007,10 +1007,11 @@ void LoopNest::splitWithMask(For* f, int factor, For** outer, For** inner) { } bool tail_is_needed = true; - if (dynamic_cast(f->start()) && - dynamic_cast(f->stop())) { - int start_val = dynamic_cast(f->start())->value(); - int stop_val = dynamic_cast(f->stop())->value(); + const Expr* start = IRSimplifier::simplify(f->start()); + const Expr* stop = IRSimplifier::simplify(f->stop()); + if (start->isConstant() && stop->isConstant()) { + int start_val = immediateAs(start); + int stop_val = immediateAs(stop); int size_val = stop_val - start_val; int tail_size = size_val % factor; if (tail_size == 0) { From 3f89b779c4152cec48a9ed2baa704cbc183e8afc Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Wed, 23 Sep 2020 14:01:43 -0700 Subject: [PATCH 061/449] [jit] allow submodule methods inference rule be different (#43872) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43872 This PR allows the recursive scripting to have a separate submodule_stubs_fn to create its submodule with specific user provided rules. Fixes https://github.com/pytorch/pytorch/issues/43729 Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D23430176 Pulled By: wanchaol fbshipit-source-id: 20530d7891ac3345b36f1ed813dc9c650b28d27a --- test/jit/test_tracer.py | 33 +++++++++++++++++++++++++++++++++ torch/jit/_recursive.py | 17 +++++++++++++++-- torch/jit/_script.py | 7 +++++-- torch/jit/_trace.py | 21 ++++----------------- 4 files changed, 57 insertions(+), 21 deletions(-) diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 22921f7d684a..518af2f95a4c 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -1310,6 +1310,39 @@ def check(mod): imported = self.getExportImportCopy(traced) check(imported.foo) + # Note that Bar's forward can only be traced, but not scripted + class Bar(nn.Module): + def __init__(self): + super().__init__() + + @torch.jit.export + def addTwo(self, x): + return x + 2 + + def forward(self, input): + return (lambda a: a + 1)(input) + + # When tracing Bar as a submodule, we only want to script the + # exported methods, and we want to keep the forwards still + # being traced. 
+ class WrapperExports(torch.nn.Module): + def __init__(self): + super(WrapperExports, self).__init__() + self.bar = Bar() + + @torch.jit.export + def addOne(self, x): + return x + 1 + + def forward(self, x): + return self.bar(x) + + f = WrapperExports() + + traced = torch.jit.trace(f, (torch.rand(3, 4),)) + expected_names = ['addOne'] + check(traced) + def test_trace_autograd_function(self): class TestFunc(torch.autograd.Function): @staticmethod diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py index 85853cd1b1ee..0eb423516f6f 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -52,6 +52,19 @@ def make_stub_from_method(nn_module, method_name): return make_stub(func, method_name) +def make_stubs_from_exported_methods(mod): + stubs = [] + for name in dir(mod): + item = getattr(mod, name, None) + if ( + _jit_internal.get_torchscript_modifier(item) + is _jit_internal.FunctionModifiers.EXPORT + ): + stubs.append(make_stub_from_method(mod, name)) + + return stubs + + # base types that can be constants # in addition, tuples and lists of these base types are also considered constants # If you edit this list, then you also need to edit the handlers in @@ -371,8 +384,8 @@ def init_fn(script_module): elif isinstance(orig_value, torch.jit.ScriptModule): scripted = orig_value else: - # use the default recursive rule to compile the module - scripted = create_script_module_impl(orig_value, sub_concrete_type, infer_methods_to_compile) + # always reuse the provided stubs_fn to infer the methods to compile + scripted = create_script_module_impl(orig_value, sub_concrete_type, stubs_fn) cpp_module.setattr(name, scripted) script_module._modules[name] = scripted diff --git a/torch/jit/_script.py b/torch/jit/_script.py index fb0465288e3f..4d28a5f2ad13 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -18,7 +18,7 @@ import torch import torch._jit_internal as _jit_internal from torch.utils import set_module -from torch.jit._recursive import ScriptMethodStub, wrap_cpp_module +from torch.jit._recursive import ScriptMethodStub, wrap_cpp_module, infer_methods_to_compile from torch.nn import Module from torch.jit._state import _enabled from torch.jit._builtins import _register_builtin @@ -200,7 +200,10 @@ def init_then_script(self, *args, **kwargs): def make_stubs(module): cls = type(module) - return [v for k, v in sorted(cls._methods.items())] + if hasattr(cls, "_methods"): + return [v for k, v in sorted(cls._methods.items())] + else: + return infer_methods_to_compile(module) self.__dict__[ "_actual_script_module" diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index 3b312c7e2161..e73785e15aea 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -22,7 +22,6 @@ from torch.jit._script import ScriptModule, _CachedForward, script from torch._jit_internal import _qualified_name from torch.autograd import function -from torch import _jit_internal from torch.nn import Module _flatten = torch._C._jit_flatten @@ -549,23 +548,11 @@ def make_module(mod, _module_class, _compilation_unit): return mod elif torch._jit_internal.module_has_exports(mod): - def make_stubs_from_exported_methods(mod): - exported = [] - for name in dir(mod): - item = getattr(mod, name, None) - if ( - torch._jit_internal.get_torchscript_modifier(item) - is _jit_internal.FunctionModifiers.EXPORT - ): - exported.append(name) - - stubs = [] - for method in exported: - stubs.append(torch.jit._recursive.make_stub_from_method(mod, method)) - return stubs - + infer_methods_stubs_fn = 
torch.jit._recursive.make_stubs_from_exported_methods return torch.jit._recursive.create_script_module( - mod, make_stubs_from_exported_methods, share_types=False + mod, + infer_methods_stubs_fn, + share_types=False ) else: if _module_class is None: From d2b045030eb60283b8aeeb2956c7ebe91628fece Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 23 Sep 2020 14:26:03 -0700 Subject: [PATCH 062/449] gtest-ify JIT tests, through the letter c (#45020) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45020 See https://github.com/pytorch/pytorch/pull/45018 for context. Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D23802296 Pulled By: suo fbshipit-source-id: 20c9798a414e9ba30869a862012cbdee0613c8b1 --- test/cpp/jit/test_autodiff.cpp | 9 +- test/cpp/jit/test_class_import.cpp | 12 +- test/cpp/jit/test_class_parser.cpp | 4 +- test/cpp/jit/test_cleanup_passes.cpp | 37 +- test/cpp/jit/test_code_template.cpp | 50 ++- test/cpp/jit/test_constant_pooling.cpp | 87 ++--- .../jit/test_create_autodiff_subgraphs.cpp | 5 +- test/cpp/jit/test_custom_class.cpp | 4 +- test/cpp/jit/test_custom_operators.cpp | 342 +++++++++--------- test/cpp/jit/test_misc.cpp | 10 + test/cpp/jit/tests.h | 16 - 11 files changed, 282 insertions(+), 294 deletions(-) diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 7d431776a971..3993c63b1708 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -83,7 +84,7 @@ variable_list grad( fmap(inputs, get_edge)); } -void testADFormulas() { +TEST(AutodiffTest, ADFormulas) { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -174,7 +175,7 @@ void testADFormulas() { } } -void testDifferentiate() { +TEST(AutodiffTest, Differentiate) { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -229,7 +230,7 @@ void testDifferentiate() { ->run(*grad_spec.df); } -void testDifferentiateWithRequiresGrad() { +TEST(AutodiffTest, DifferentiateWithRequiresGrad) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index 82bc0cf3bccc..ffa845b3e2a8 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include -#include +#include #include +#include #include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testClassImport() { +TEST(ClassImportTest, Basic) { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ void testClassImport() { ASSERT_FALSE(c); } -void testScriptObject() { +TEST(ClassImportTest, ScriptObject) { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -void testClassDerive() { +TEST(ClassImportTest, ClassDerive) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -void testSaveLoadTorchbind() { +TEST(ClassImportTest, CustomClass) { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of 
FooTest into two namespaces. diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index 45e37103bb5a..a5b19f63fd3f 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -15,7 +17,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -void testClassParser() { +TEST(ClassParserTest, Basic) { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 2f2ca4e0a19b..38ceef932eb0 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -void testCleanUpPasses() { +TEST(CleanupPassTest, Basic) { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. - { - auto graph = std::make_shared(); - parseIR( - R"IR( + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,20 +31,19 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); - } + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index e4d7d1ef856e..35897474f1f2 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include "test/cpp/jit/test_base.h" -#include "test/cpp/jit/test_utils.h" +#include +#include #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,31 +33,29 @@ static const auto ct_expect = R"( int notest(int a) )"; -void testCodeTemplate() { - { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); - } +TEST(TestCodeTemplate, Copying) { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); +} - { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); - } +TEST(TestCodeTemplate, Formatting) { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", 
"others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index b949c9a45b25..c8cb58e1886a 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -11,26 +12,26 @@ namespace torch { namespace jit { -void testConstantPooling() { - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(ConstantPoolingTest, Int) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingAcrossBlocks) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -44,17 +45,18 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingDifferentDevices) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -70,22 +72,21 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); - } + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index 8da6d9d6a1b2..e97043f84d24 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include 
"test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -6,7 +7,7 @@ namespace torch { namespace jit { -void testCreateAutodiffSubgraphs() { +TEST(CreateAutodiffSubgraphsTest, Basic) { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 543fbc20eb3d..25c518d3142c 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,3 +1,5 @@ +#include + #include #include @@ -318,7 +320,7 @@ TORCH_LIBRARY(_TorchScriptTesting, m) { } // namespace -void testTorchbindIValueAPI() { +TEST(CustomClassTest, TorchbindIValueAPI) { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 529b36385bd4..d3f61268e8f1 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -11,134 +12,135 @@ namespace torch { namespace jit { -void testCustomOperators() { - { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, InferredSchema) { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> Tensor", - [](double a, at::Tensor b) { return a + b; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ExplicitSchema) { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + auto& ops = + 
getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - // Check that lists work well. - torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); - } - { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ListParameters) { + // Check that lists work well. 
+ torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); +} - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); +TEST(CustomOperatorTest, ListParameters2) { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - ASSERT_EQ(op->schema().arguments().size(), 1); - ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); - } + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); } -void testCustomOperatorAliasing() { +TEST(CustomOperatorTest, Aliasing) { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -182,77 +184,65 @@ graph(%x: Tensor, %y: Tensor): } } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - -void testTemplatedOperatorCreator() { - constexpr char op_list[] = "foofoo::bar.template;foo::another"; +static constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ 
torch::detail::SelectiveStr(n) - { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); - } +TEST(TestCustomOperator, OperatorGeneratorUndeclared) { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); +} - { - // The operator should be successfully registered since its name is in the - // whitelist. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } +TEST(TestCustomOperator, OperatorGeneratorBasic) { + // The operator should be successfully registered since its name is in the + // whitelist. 
+ torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); } } // namespace jit diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 953d1bf42fc0..92baba1168da 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2225,5 +2225,15 @@ void testProfilerDisableInCallback() { t.join(); } +void testIValueKWargs() { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 45d7f48b1f8a..8f43882c9e22 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -9,22 +9,14 @@ namespace torch { namespace jit { #define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ _(Attributes) \ _(Blocks) \ _(CallStack) \ _(CallStackCaching) \ - _(CodeTemplate) \ _(ControlFlow) \ - _(CreateAutodiffSubgraphs) \ - _(CustomOperators) \ - _(CustomOperatorAliasing) \ - _(TemplatedOperatorCreator) \ _(IValueKWargs) \ _(CustomFusion) \ _(SchemaMatching) \ - _(Differentiate) \ - _(DifferentiateWithRequiresGrad) \ _(FromQualString) \ _(InternedStrings) \ _(PassManagement) \ @@ -35,12 +27,9 @@ namespace jit { _(SubgraphUtils) \ _(SubgraphUtilsVmap) \ _(IRParser) \ - _(ConstantPooling) \ - _(CleanUpPasses) \ _(THNNConv) \ _(ATenNativeBatchNorm) \ _(NoneSchemaMatch) \ - _(ClassParser) \ _(UnifyTypes) \ _(Profiler) \ _(FallbackGraphs) \ @@ -61,15 +50,11 @@ namespace jit { _(ModuleDeepcopyAliasing) \ _(ModuleDefine) \ _(QualifiedName) \ - _(ClassImport) \ - _(ScriptObject) \ _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ _(DCE) \ _(CustomFusionNestedBlocks) \ - _(ClassDerive) \ - _(SaveLoadTorchbind) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ _(ClassTypeAddRemoveAttr) \ @@ -100,7 +85,6 @@ namespace jit { _(LiteInterpreterHierarchyModuleInfo) \ _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ _(LiteInterpreterEval) \ - _(TorchbindIValueAPI) \ _(LiteInterpreterDict) \ _(LiteInterpreterFindAndRunMethod) \ _(LiteInterpreterFindWrongMethodName) \ From 246bd9422a1f64965ad9082798c8b17f96bc2924 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 23 Sep 2020 14:26:03 -0700 Subject: [PATCH 063/449] gtestify dce and fuser tests (#45055) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45055 See https://github.com/pytorch/pytorch/pull/45018 for context. 
Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D23811085 Pulled By: suo fbshipit-source-id: 45008e41f2394d2ba319745b0340392e1b3d3172 --- test/cpp/jit/test_dce.cpp | 6 +++--- test/cpp/jit/test_fuser.cpp | 41 +++++++++++++++++++------------------ test/cpp/jit/tests.h | 7 +------ 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 5799913c316a..6f9161d0d9ae 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include -#include +#include +#include #include #include namespace torch { namespace jit { -void testDCE() { +TEST(EliminateDeadCodeTest, Basic) { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ee0ea060f02f..ef595215b882 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include "test/cpp/jit/test_base.h" +#include #include #include "ATen/core/interned_strings.h" @@ -56,28 +56,27 @@ namespace torch { namespace jit { -void testFusion() { - auto testSimple = [&] { - const auto graph_string = R"IR( +TEST(FuserTest, TestSimple_CUDA) { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); - }; - testSimple(); + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); +} +TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -132,7 +131,9 @@ void testFusion() { testOne(0, 1); testOne(1, 2); testOne(0, 2); +} +TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -175,7 +176,7 @@ void testFusion() { }; } -void testFusionAliasing() { +TEST(FuserTest, FusionAliasing) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -200,7 +201,7 @@ void testFusionAliasing() { ->run(*g); } -void testRegisterFusionCachesKernel() { +TEST(FuserTest, KernelCaching) { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 8f43882c9e22..186aaaec2bba 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -21,7 +21,6 @@ namespace jit { _(InternedStrings) \ _(PassManagement) \ _(Proto) \ - _(RegisterFusionCachesKernel) \ _(SchemaParser) \ _(TopologicalIndex) \ _(SubgraphUtils) \ @@ -53,7 +52,6 @@ namespace jit { _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ - _(DCE) \ _(CustomFusionNestedBlocks) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ @@ -93,12 +91,10 @@ namespace jit { 
_(MobileSaveLoadParameters) \ _(MobileSaveLoadParametersEmpty) \ _(LiteSGD) \ - _(LiteSequentialSampler) \ - _(FusionAliasing) + _(LiteSequentialSampler) #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ @@ -203,7 +199,6 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ From 2a1a51facbba6f9be2cc80aa6b91d795666eda46 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 23 Sep 2020 14:49:02 -0700 Subject: [PATCH 064/449] Fix typos. (#45195) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45195 Fix some typos in reducer class. ghstack-source-id: 112673443 Test Plan: N/A Reviewed By: rohan-varma Differential Revision: D23862399 fbshipit-source-id: 0dc69e5ea1fa7d33c85d1909b2216bcd1f579f6a --- torch/csrc/distributed/c10d/reducer.cpp | 4 ++-- torch/csrc/distributed/c10d/reducer.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index a895bea5fc26..1a5766eea84e 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -190,7 +190,7 @@ Reducer::Reducer( // used to override how DDP communicates gradients across ranks, this can be // used for algorithms like Gradient Compression/GossipGrad. This hook can be // registered from Python API using `register_comm_hook`. `PythonCommHook` -// enables registering a Python hook and is a sub class of `CommHookInterface`. +// enables registering a Python hook and is a subclass of `CommHookInterface`. // `CommHookInterface` can be used to implement CPP hooks in the future. Reducer::~Reducer() noexcept(false) { @@ -493,7 +493,7 @@ void Reducer::autograd_hook(VariableIndex index) { // rebuilt_param_indices_ based on gradient arriving order, and then at the // end of finalize_backward(), buckets will be rebuilt based on // rebuilt_params_ and rebuilt_param_indices_, and then will be broadcasted - // and initialized. Also we only need to dump tensors and parameter indcies of + // and initialized. Also we only need to dump tensors and parameter indices of // one replica. push_rebuilt_params(index); diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index d45e5c2b90e1..3b441c99a3b6 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -179,7 +179,7 @@ class Reducer { // and on the same device can be batched. The tensor that represents the // flattened gradient uses the same type and is placed on the same device. // Buckets are filled as the gradients they hold are computed (triggered by - // autograd hooks). Buckets are reduced in a predetemined order that is + // autograd hooks). Buckets are reduced in a predetermined order that is // identical across processes. struct BucketReplica { // Flattened (1 dimensional) contents of bucket. 
From 8e0fc711f49cecac15a944ede4451703c3db1c02 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 14:49:27 -0700 Subject: [PATCH 065/449] [TensorExpr] Remove unused EvalConstExpr function (#45180) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45180 Test Plan: build Reviewed By: ezyang Differential Revision: D23877151 Pulled By: asuhan fbshipit-source-id: a5d4d211c1dc85e6f7045330606163a933b9474e --- torch/csrc/jit/tensorexpr/loopnest.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index b7862fb953c1..2cbc7bdf186d 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -23,17 +23,6 @@ namespace torch { namespace jit { namespace tensorexpr { -namespace { - -// Evaluates a constant expression and returns its value. -template -static T EvalConstExpr(const ExprHandle& expr) { - ExprEval eval(expr); - return eval.value(); -} - -} // namespace - class IndexFlattener : public IRMutator { public: Stmt* flatten(Stmt* s) { From 049599886289a5ddb2b16a58564a66ecabac0142 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 14:53:17 -0700 Subject: [PATCH 066/449] [TensorExpr] Disallow arithmetic binary operations on Bool (#44677) Summary: Arithmetic operations on Bool aren't fully supported in the evaluator. Moreover, such semantics can be implemented by the client code through insertion of explicit casts to widen and narrow to the desired types. Pull Request resolved: https://github.com/pytorch/pytorch/pull/44677 Test Plan: test_tensorexpr --gtest_filter=TensorExprTest.ExprDisallowBoolArithmetic python test/test_jit_fuser_te.py Reviewed By: agolynski Differential Revision: D23801412 Pulled By: asuhan fbshipit-source-id: fff5284e3a216655dbf5a9a64d1cb1efda271a36 --- test/cpp/tensorexpr/test_expr.cpp | 18 +++++++ test/cpp/tensorexpr/tests.h | 1 + test/test_jit_fuser_te.py | 77 ++++++++++++++++++++++++++++ torch/csrc/jit/tensorexpr/kernel.cpp | 22 +++++--- torch/csrc/jit/tensorexpr/types.h | 4 ++ 5 files changed, 116 insertions(+), 6 deletions(-) diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index c1386a85764b..e94e70aa6b38 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -164,6 +164,24 @@ void testExprDoubleTest() { ASSERT_EQ(eval.value(), 2 + (3 * 3 + 4)); } +void testExprDisallowBoolArithmetic() { + KernelScope kernel_scope; + VarHandle x("x", kBool); + VarHandle y("y", kBool); + std::string error{"arithmetic binary operations on Bool not supported"}; + ASSERT_THROWS_WITH((x + y), error); + ASSERT_THROWS_WITH((x - y), error); + ASSERT_THROWS_WITH((x * y), error); + ASSERT_THROWS_WITH((x / y), error); + ASSERT_THROWS_WITH((x & y), error); + ASSERT_THROWS_WITH((x | y), error); + ASSERT_THROWS_WITH((x ^ y), error); + ASSERT_THROWS_WITH((x << y), error); + ASSERT_THROWS_WITH((x >> y), error); + ASSERT_THROWS_WITH(Max::make(x, y, /*propagate_nans=*/true), error); + ASSERT_THROWS_WITH(Min::make(x, y, /*propagate_nans=*/true), error); +} + void testExprVectorAdd01() { KernelScope kernel_scope; const int kVectorSize = 8; diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index d0a9aa840b91..2d42a4a93967 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -23,6 +23,7 @@ namespace jit { _(ExprLongTest) \ _(ExprHalfTest) \ _(ExprDoubleTest) \ + _(ExprDisallowBoolArithmetic) \ _(ExprVectorAdd01) \ 
_(ExprCompareSelectEQ) \ _(ExprCompareSelectDtypes) \ diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 453047eca8be..f9aca9a5dea1 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1,5 +1,6 @@ from collections import defaultdict +import operator import unittest import contextlib import torch @@ -459,6 +460,82 @@ def func(x): graph = backward_graph(s, skip_check=True) self.assertAllFused(graph, except_for={'aten::div', 'prim::Constant'}) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_add_bool(self): + def f(x, y, z): + return x + y + z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_mul_bool(self): + def f(x, y, z): + return x * y * z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_div_bool(self): + def f(x, y, z): + return (x + y) / z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.ones_like(x, dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_bitwise_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + binary_ops = [ + operator.__and__, + operator.__or__, + operator.__xor__ + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_comparison_eq_ne(self): def f(x, y): diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 5cd414bbe2df..293ea780ed27 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -673,11 +673,20 @@ Tensor* TensorExprKernel::computeFourOperand( }); } +namespace { + +// Convert boolean to integer, if needed. +ExprHandle boolToInteger(const ExprHandle& x) { + return x.dtype().scalar_type() == ScalarType::Bool ? 
cast(x) : x; +} + +} // namespace + Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { switch (v->node()->kind()) { case aten::add: { auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs + rhs; + return boolToInteger(lhs) + boolToInteger(rhs); }; TORCH_INTERNAL_ASSERT( v->node()->inputs().size() == 2 || v->node()->inputs().size() == 3); @@ -694,6 +703,7 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::sub: { auto sub_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) { + // NB: sub isn't supported on boolean, no need to promote to integer. return lhs - rhs; }; TORCH_INTERNAL_ASSERT( @@ -706,35 +716,35 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::mul: { return computeTwoOperand( "aten_mul", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs * rhs; + return boolToInteger(lhs) * boolToInteger(rhs); }); } break; case aten::div: { return computeTwoOperand( "aten_div", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs / rhs; + return boolToInteger(lhs) / boolToInteger(rhs); }); } break; case aten::__and__: { return computeTwoOperand( "aten_and", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs & rhs; + return boolToInteger(lhs) & boolToInteger(rhs); }); } break; case aten::__or__: { return computeTwoOperand( "aten_or", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs | rhs; + return boolToInteger(lhs) | boolToInteger(rhs); }); } break; case aten::__xor__: { return computeTwoOperand( "aten_xor", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return lhs ^ rhs; + return boolToInteger(lhs) ^ boolToInteger(rhs); }); } break; diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h index 8dd67c8b7125..8e39ad231545 100644 --- a/torch/csrc/jit/tensorexpr/types.h +++ b/torch/csrc/jit/tensorexpr/types.h @@ -124,6 +124,10 @@ inline Dtype BinaryOpDtype( Dtype op1_dtype, Dtype op2_dtype, ScalarType ret_type = ScalarType::None) { + if (op1_dtype.scalar_type() == ScalarType::Bool || + op2_dtype.scalar_type() == ScalarType::Bool) { + throw malformed_input("arithmetic binary operations on Bool not supported"); + } if (op1_dtype == op2_dtype) { if (ret_type == ScalarType::None) { return op1_dtype; From f93ead6d37b476576b50a2f550c5898415a1fe35 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 23 Sep 2020 15:37:50 -0700 Subject: [PATCH 067/449] [quant][eagermode] Custom module support (#44835) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44835 This is for feature parity with fx graph mode quantization Test Plan: Imported from OSS Reviewed By: z-a-f Differential Revision: D23745086 fbshipit-source-id: ae2fc86129f9896d5a9039b73006a4da15821307 --- test/quantization/test_quantize.py | 113 ++++++++++++++++++ torch/quantization/__init__.py | 1 + torch/quantization/quantize.py | 87 +++++++++----- .../testing/_internal/common_quantization.py | 23 +++- 4 files changed, 193 insertions(+), 31 deletions(-) diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 91594da111c1..e54eb33770c2 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -14,6 +14,8 @@ fuse_modules, quantize_dynamic, QuantWrapper, + QuantStub, + DeQuantStub, QConfig, default_qconfig, default_qat_qconfig, @@ -21,6 +23,8 @@ per_channel_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig, + 
register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.testing._internal.common_quantization import ( @@ -571,6 +575,115 @@ def forward(self, indices, offsets, per_sample_weights, linear_in): self.checkLinear(model.fc) self.checkDynamicQuantizedModule(quantized_model.emb, torch.nn.quantized.EmbeddingBag, torch.quint8) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.custom(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv1(x) + x = self.conv2(x) + x = self.dequant(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + original_m.qconfig = default_qconfig + m = prepare(original_m) + self.checkObservers(m) + # calibration + m(data) + # all activation observers are inserted in the top level module + + # check converted/quantized model + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), nnq.Conv2d) + self.assertEqual(type(m.custom.conv), nnq.Conv2d) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = default_qconfig + ref_m = prepare(original_ref_m) + ref_m(data) + ref_m = convert(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git 
a/torch/quantization/__init__.py b/torch/quantization/__init__.py index 3193c332469f..31943e56e6a3 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -46,6 +46,7 @@ def default_eval_fn(model, calib_data): 'register_quantized_custom_mdoule_mapping', 'get_quantized_custom_module_class', 'is_custom_module_class', + 'is_observed_custom_module', # Sub functions for `prepare` and `swap_module` 'propagate_qconfig_', 'add_quant_dequant', 'add_observer_', 'swap_module', 'default_eval_fn', 'get_observer_dict', diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 8bc3b6ffc532..19a27e62ac5b 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -14,6 +14,14 @@ get_qat_module_mappings, get_qconfig_propagation_list) +from .custom_module_class_mappings import ( + is_custom_module_class, + get_observed_custom_module_class, + get_quantized_custom_module_class, + mark_observed_custom_module, + is_observed_custom_module, +) + from .stubs import DeQuantStub, QuantWrapper from .qconfig import default_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig @@ -117,38 +125,52 @@ def get_activation_post_process(qconfig, device): activation.to(device) return activation - for child in module.children(): + def needs_observation(m): + return hasattr(m, 'qconfig') and m.qconfig is not None + + def insert_activation_post_process(m): + """ Adds an activation post process module and register + a post hook that calls the module + """ + if needs_observation(m): + # observer and hook will be gone after we swap the module + m.add_module('activation_post_process', get_activation_post_process(m.qconfig, device)) + # Register observer as the first entry in the hook list + # All post forward hooks are preserved and will be executed after the observer before convert + handle = register_activation_post_process_hook(m) + m._forward_hooks.move_to_end(handle.id, last=False) + + for name, child in module.named_children(): if type(child) == nnq.FloatFunctional or type(child) == nnq.QFunctional: if hasattr(child, 'qconfig') and child.qconfig is not None: child.activation_post_process = get_activation_post_process(child.qconfig, device) elif non_leaf_module_list is not None and type(child) in non_leaf_module_list: - if hasattr(child, 'qconfig') and child.qconfig is not None: - child.add_module('activation_post_process', get_activation_post_process(child.qconfig, device)) - register_activation_post_process_hook(child) - + insert_activation_post_process(child) + # TODO: remove + if needs_observation(child): # Attaching prehook if prehook is not None: child.add_module('activation_pre_process', prehook()) child.register_forward_pre_hook(_observer_forward_pre_hook) + elif needs_observation(child) and is_custom_module_class(type(child)): + observed_child = get_observed_custom_module_class(type(child)).from_float(child) + mark_observed_custom_module(observed_child, type(child)) + setattr(module, name, observed_child) + insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, prehook) # Insert observers only for leaf nodes, note that this observer is for # the output of the module, for input QuantStub will observe them - if hasattr(module, 'qconfig') and module.qconfig is not None and \ - len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ + if len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ and type(module) in 
qconfig_propagation_list: - # observer and hook will be gone after we swap the module - module.add_module('activation_post_process', get_activation_post_process(module.qconfig, device)) - # Register observer as the first entry in the hook list - # All post forward hooks are preserved and will be executed after the observer before convert - handle = register_activation_post_process_hook(module) - module._forward_hooks.move_to_end(handle.id, last=False) - - # Attaching prehook - if prehook is not None: - module.add_module('activation_pre_process', prehook()) - module.register_forward_pre_hook(_observer_forward_pre_hook) + insert_activation_post_process(module) + # TOOD: remove + if needs_observation(module): + # Attaching prehook + if prehook is not None: + module.add_module('activation_pre_process', prehook()) + module.register_forward_pre_hook(_observer_forward_pre_hook) def get_unique_devices_(module): return {p.device for p in module.parameters()} | \ @@ -429,7 +451,10 @@ def _convert(module, mapping=None, inplace=False): nniqat.ConvBnReLU2d) for name, mod in module.named_children(): - if type(mod) not in SWAPPABLE_MODULES: + # both swappable modules and observed custom modules are + # swapped as one unit + if type(mod) not in SWAPPABLE_MODULES and \ + not is_observed_custom_module(mod): _convert(mod, mapping, inplace=True) reassign[name] = swap_module(mod, mapping) @@ -452,15 +477,15 @@ def swap_module(mod, mapping): new_mod = mod # Always replace dequantstub with dequantize if hasattr(mod, 'qconfig') and mod.qconfig is not None or type(mod) == DeQuantStub: - if type(mod) in mapping: - # respect device affinity when swapping modules - devices = get_unique_devices_(mod) - assert len(devices) <= 1, ( - "swap_module only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None + swapped = False + if is_observed_custom_module(mod): + new_mod = get_quantized_custom_module_class(mod._FLOAT_MODULE).from_observed(mod) + swapped = True + elif type(mod) in mapping: new_mod = mapping[type(mod)].from_float(mod) + swapped = True + + if swapped: # Preserve module's pre forward hooks. 
They'll be called on quantized input for pre_hook_fn in mod._forward_pre_hooks.values(): new_mod.register_forward_pre_hook(pre_hook_fn) @@ -469,6 +494,14 @@ def swap_module(mod, mapping): for hook_fn in mod._forward_hooks.values(): if hook_fn is not _observer_forward_hook: new_mod.register_forward_hook(hook_fn) + + # respect device affinity when swapping modules + devices = get_unique_devices_(mod) + assert len(devices) <= 1, ( + "swap_module only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None if device: new_mod.to(device) return new_mod diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 3edbd5dd7fcd..4031b2fdd0de 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -13,6 +13,10 @@ default_qconfig, default_dynamic_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \ propagate_qconfig_, convert, get_default_qconfig, quantize_dynamic_jit, quantize_jit, float_qparams_dynamic_qconfig, \ get_default_qat_qconfig +from torch.quantization import ( + is_custom_module_class, + is_observed_custom_module, +) from torch.quantization.quantization_mappings import ( get_dynamic_quant_module_mappings, get_qconfig_propagation_list, @@ -344,14 +348,25 @@ def checkObservers(self, module, propagate_qconfig_list=None): """ if propagate_qconfig_list is None: propagate_qconfig_list = get_qconfig_propagation_list() - if hasattr(module, 'qconfig') and module.qconfig is not None and \ - len(module._modules) == 0 and not isinstance(module, torch.nn.Sequential) \ - and type(module) in propagate_qconfig_list: + + # check if a module is a leaf module, ignoring activation_post_process attribute + def is_leaf_module(module): + submodule_name_count = 0 + for name, _ in module.named_children(): + if name != 'activation_post_process': + submodule_name_count += 1 + return submodule_name_count == 0 + + if (hasattr(module, 'qconfig') and module.qconfig is not None and + is_leaf_module(module) and not isinstance(module, torch.nn.Sequential) + and type(module) in propagate_qconfig_list) or \ + is_custom_module_class(type(module)): self.assertTrue(hasattr(module, 'activation_post_process'), 'module: ' + str(type(module)) + ' do not have observer') # we don't need to check observers for child modules of the # qat modules - if type(module) not in get_qat_module_mappings().values(): + if type(module) not in get_qat_module_mappings().values() and \ + not is_observed_custom_module(module): for child in module.children(): self.checkObservers(child) From 76c185dccaca99b753d51a5c3eae6f8c67e61f82 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 17:03:48 -0700 Subject: [PATCH 068/449] [TensorExpr] When lanes differ, insert Broadcast instead of Cast (#45179) Summary: We need to check if dtypes differ in scalar type or lanes to decide between Cast and Broadcast. 
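In other words: when only the lane counts differ, the scalar term has to be widened with a `Broadcast`; a plain `Cast` is only correct for a scalar-type mismatch, and when both differ the `Cast` is applied on top of the broadcast value. A self-contained sketch of that decision, using stand-in types rather than the real tensorexpr classes (see the `ir_simplifier.cpp` hunk below for the actual code):

```cpp
// Stand-in types only; the real logic lives in TermExpander::mutate (below).
#include <cassert>

struct Dtype {
  int scalar_type; // stand-in for torch::jit::tensorexpr::ScalarType
  int lanes;       // vector width
};

enum class Fixup { None, CastOnly, BroadcastOnly, BroadcastThenCast };

// Lane mismatch is handled first (Broadcast), then any remaining
// scalar-type mismatch (Cast applied to the broadcast value).
Fixup reconcile(const Dtype& term, const Dtype& target) {
  const bool lanesDiffer = term.lanes != target.lanes;
  const bool scalarsDiffer = term.scalar_type != target.scalar_type;
  if (lanesDiffer && scalarsDiffer) return Fixup::BroadcastThenCast;
  if (lanesDiffer) return Fixup::BroadcastOnly;
  if (scalarsDiffer) return Fixup::CastOnly;
  return Fixup::None;
}

int main() {
  // A 1-lane int scalar folded into an 8-lane int term: only the lane
  // count differs, so a Broadcast (not a Cast) is required.
  assert(reconcile({/*scalar_type=*/0, /*lanes=*/1},
                   {/*scalar_type=*/0, /*lanes=*/8}) == Fixup::BroadcastOnly);
  return 0;
}
```

The new `SimplifyBroadcastTermExpander` test added below exercises exactly this broadcast-only case: the two `bc1` terms collapse to `2 * bc1`, and the 1-lane scalar `2` must be made multi-lane before the multiply.
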
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45179 Test Plan: test_tensorexpr --gtest_filter=TensorExprTest.SimplifyBroadcastTermExpander Reviewed By: bwasti Differential Revision: D23873316 Pulled By: asuhan fbshipit-source-id: ca141be67e10c2b6c5f2ff9c11e42dcfc62ac620 --- test/cpp/tensorexpr/test_simplify.cpp | 26 +++++++++++++++++++++ test/cpp/tensorexpr/tests.h | 1 + torch/csrc/jit/tensorexpr/ir_simplifier.cpp | 20 +++++++++++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index f8c5cdd3546d..f0185884fc58 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -3964,5 +3964,31 @@ void testSimplifyRampSubBroadcast() { ASSERT_EQ(newRamp->lanes(), num_lanes); } +void testSimplifyBroadcastTermExpander() { + KernelScope kernel_scope; + int num_lanes = 8; + ExprHandle bc0 = Broadcast::make(ExprHandle(0), num_lanes); + ExprHandle bc1 = Broadcast::make(ExprHandle(1), num_lanes); + ExprHandle bc2 = Broadcast::make(ExprHandle(2), num_lanes); + // NB: We need a term in the middle which isn't simplified to trigger the + // relevant path in TermExpander::mutate. The two bc1 terms are brought + // together and simplified to 2 * bc1, which then needs to make 2 multi-lane. + ExprHandle simplified = IRSimplifier::simplify(bc1 + (bc0 / bc2) + bc1); + Buffer buf(BufHandle("buf", {num_lanes}, kInt)); + // The result isn't fully simplified currently and thus would be brittle to + // match. Observe its value instead. + auto store = Store::make( + buf, + {Ramp::make(0, 1, num_lanes)}, + simplified, + Broadcast::make(ExprHandle(1), num_lanes)); + SimpleIREvaluator eval(store, buf); + std::vector output(num_lanes); + eval(output); + for (int i = 0; i < num_lanes; ++i) { + ASSERT_EQ(output[i], 2); + } +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 2d42a4a93967..34eeaa0de19a 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -219,6 +219,7 @@ namespace jit { _(SimplifyFuseConditions) \ _(SimplifySyncThreads) \ _(SimplifyRampSubBroadcast) \ + _(SimplifyBroadcastTermExpander) \ _(RegisterizerSimple) \ _(RegisterizerLoop) \ _(RegisterizerLoopFixedLoad) \ diff --git a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp index f6852b627969..37c856a2e618 100644 --- a/torch/csrc/jit/tensorexpr/ir_simplifier.cpp +++ b/torch/csrc/jit/tensorexpr/ir_simplifier.cpp @@ -1503,9 +1503,23 @@ const Expr* TermExpander::mutate(const Term* v) { if (lastNode) { // We want to avoid a leaving a CastNode on the scalar, so handle that // now. - if (v->scalar()->dtype() != lastNode->dtype()) { - lastNode = new Mul( - evaluateOp(new Cast(lastNode->dtype(), v->scalar())), lastNode); + auto termDtype = v->scalar()->dtype(); + auto lastNodeDtype = lastNode->dtype(); + if (termDtype != lastNodeDtype) { + const Expr* castV = v->scalar(); + // Take care of lane mismatch first. + if (termDtype.lanes() != lastNodeDtype.lanes()) { + castV = new Broadcast(v->scalar(), lastNodeDtype.lanes()); + } + // Now take care of scalar type as well. + if (termDtype.scalar_type() != lastNodeDtype.scalar_type()) { + castV = new Cast(lastNode->dtype(), castV); + // For scalars, we can simplify the cast further. 
+ if (lastNodeDtype.lanes() == 1) { + castV = evaluateOp(castV); + } + } + lastNode = new Mul(castV, lastNode); } else { lastNode = new Mul(v->scalar(), lastNode); } From 89c570ed0a1bfc096e2e299637c7c62831c3dd26 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 23 Sep 2020 17:25:57 -0700 Subject: [PATCH 069/449] Revert D23811085: gtestify dce and fuser tests Test Plan: revert-hammer Differential Revision: D23811085 (https://github.com/pytorch/pytorch/commit/246bd9422a1f64965ad9082798c8b17f96bc2924) Original commit changeset: 45008e41f239 fbshipit-source-id: 94c981f565cab9b710fe52a55bbe8dbf9c179c23 --- test/cpp/jit/test_dce.cpp | 6 +++--- test/cpp/jit/test_fuser.cpp | 41 ++++++++++++++++++------------------- test/cpp/jit/tests.h | 7 ++++++- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 6f9161d0d9ae..5799913c316a 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include +#include +#include -#include #include #include namespace torch { namespace jit { -TEST(EliminateDeadCodeTest, Basic) { +void testDCE() { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ef595215b882..ee0ea060f02f 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include +#include "test/cpp/jit/test_base.h" #include #include "ATen/core/interned_strings.h" @@ -56,27 +56,28 @@ namespace torch { namespace jit { -TEST(FuserTest, TestSimple_CUDA) { - const auto graph_string = R"IR( +void testFusion() { + auto testSimple = [&] { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); -} + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); + }; + testSimple(); -TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -131,9 +132,7 @@ TEST(FuserTest, TestOne_CUDA) { testOne(0, 1); testOne(1, 2); testOne(0, 2); -} -TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -176,7 +175,7 @@ TEST(FuserTest, FusedConcat_CUDA) { }; } -TEST(FuserTest, FusionAliasing) { +void testFusionAliasing() { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -201,7 +200,7 @@ TEST(FuserTest, FusionAliasing) { ->run(*g); } -TEST(FuserTest, KernelCaching) { +void testRegisterFusionCachesKernel() { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 186aaaec2bba..8f43882c9e22 100644 --- 
a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -21,6 +21,7 @@ namespace jit { _(InternedStrings) \ _(PassManagement) \ _(Proto) \ + _(RegisterFusionCachesKernel) \ _(SchemaParser) \ _(TopologicalIndex) \ _(SubgraphUtils) \ @@ -52,6 +53,7 @@ namespace jit { _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ + _(DCE) \ _(CustomFusionNestedBlocks) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ @@ -91,10 +93,12 @@ namespace jit { _(MobileSaveLoadParameters) \ _(MobileSaveLoadParametersEmpty) \ _(LiteSGD) \ - _(LiteSequentialSampler) + _(LiteSequentialSampler) \ + _(FusionAliasing) #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ + _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ @@ -199,6 +203,7 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ + _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ From e9aa6898ab83988f6f3f5df351907e77e7cd38be Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Wed, 23 Sep 2020 17:40:15 -0700 Subject: [PATCH 070/449] Revert D23802296: gtest-ify JIT tests, through the letter c Test Plan: revert-hammer Differential Revision: D23802296 (https://github.com/pytorch/pytorch/commit/d2b045030eb60283b8aeeb2956c7ebe91628fece) Original commit changeset: 20c9798a414e fbshipit-source-id: a28d56039ca404fe94ed7572f1febd1673e3e788 --- test/cpp/jit/test_autodiff.cpp | 9 +- test/cpp/jit/test_class_import.cpp | 12 +- test/cpp/jit/test_class_parser.cpp | 4 +- test/cpp/jit/test_cleanup_passes.cpp | 37 +- test/cpp/jit/test_code_template.cpp | 50 +-- test/cpp/jit/test_constant_pooling.cpp | 87 +++-- .../jit/test_create_autodiff_subgraphs.cpp | 5 +- test/cpp/jit/test_custom_class.cpp | 4 +- test/cpp/jit/test_custom_operators.cpp | 342 +++++++++--------- test/cpp/jit/test_misc.cpp | 10 - test/cpp/jit/tests.h | 16 + 11 files changed, 294 insertions(+), 282 deletions(-) diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 3993c63b1708..7d431776a971 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,5 +1,4 @@ -#include - +#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -84,7 +83,7 @@ variable_list grad( fmap(inputs, get_edge)); } -TEST(AutodiffTest, ADFormulas) { +void testADFormulas() { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -175,7 +174,7 @@ TEST(AutodiffTest, ADFormulas) { } } -TEST(AutodiffTest, Differentiate) { +void testDifferentiate() { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -230,7 +229,7 @@ TEST(AutodiffTest, Differentiate) { ->run(*grad_spec.df); } -TEST(AutodiffTest, DifferentiateWithRequiresGrad) { +void testDifferentiateWithRequiresGrad() { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index ffa845b3e2a8..82bc0cf3bccc 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include +#include +#include #include -#include #include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -TEST(ClassImportTest, Basic) { +void testClassImport() { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ 
TEST(ClassImportTest, Basic) { ASSERT_FALSE(c); } -TEST(ClassImportTest, ScriptObject) { +void testScriptObject() { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -TEST(ClassImportTest, ClassDerive) { +void testClassDerive() { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -TEST(ClassImportTest, CustomClass) { +void testSaveLoadTorchbind() { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of FooTest into two namespaces. diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index a5b19f63fd3f..45e37103bb5a 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,5 +1,3 @@ -#include - #include #include #include @@ -17,7 +15,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -TEST(ClassParserTest, Basic) { +void testClassParser() { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 38ceef932eb0..2f2ca4e0a19b 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ -#include - #include #include #include #include +#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -TEST(CleanupPassTest, Basic) { +void testCleanUpPasses() { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. - auto graph = std::make_shared(); - parseIR( - R"IR( + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,19 +31,20 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); + } } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index 35897474f1f2..e4d7d1ef856e 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include +#include "test/cpp/jit/test_base.h" +#include "test/cpp/jit/test_utils.h" -#include #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,29 +33,31 @@ static const auto ct_expect = R"( int notest(int a) )"; -TEST(TestCodeTemplate, Copying) { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); -} +void testCodeTemplate() { + { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + 
c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); + } -TEST(TestCodeTemplate, Formatting) { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); + { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", "others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); + } } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index c8cb58e1886a..b949c9a45b25 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,10 +1,9 @@ -#include - #include #include #include #include #include +#include "test/cpp/jit/test_base.h" #include #include @@ -12,26 +11,26 @@ namespace torch { namespace jit { -TEST(ConstantPoolingTest, Int) { - auto graph = std::make_shared(); - parseIR( - R"IR( +void testConstantPooling() { + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - ->run(*graph); -} - -TEST(ConstantPoolingTest, PoolingAcrossBlocks) { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); + } + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -45,18 +44,17 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); -} - -TEST(ConstantPoolingTest, PoolingDifferentDevices) { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); + } + { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -72,21 +70,22 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + 
testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); + } } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index e97043f84d24..8da6d9d6a1b2 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,5 +1,4 @@ -#include - +#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -7,7 +6,7 @@ namespace torch { namespace jit { -TEST(CreateAutodiffSubgraphsTest, Basic) { +void testCreateAutodiffSubgraphs() { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 25c518d3142c..543fbc20eb3d 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,5 +1,3 @@ -#include - #include #include @@ -320,7 +318,7 @@ TORCH_LIBRARY(_TorchScriptTesting, m) { } // namespace -TEST(CustomClassTest, TorchbindIValueAPI) { +void testTorchbindIValueAPI() { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index d3f61268e8f1..529b36385bd4 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,5 +1,4 @@ -#include - +#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -12,135 +11,134 @@ namespace torch { namespace jit { -TEST(CustomOperatorTest, InferredSchema) { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); +void testCustomOperators() { + { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); -} + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); -TEST(CustomOperatorTest, ExplicitSchema) { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> 
Tensor", - [](double a, at::Tensor b) { return a + b; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + } + { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& ops = + getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); -} + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); -TEST(CustomOperatorTest, ListParameters) { - // Check that lists work well. 
- torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); -} + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); -TEST(CustomOperatorTest, ListParameters2) { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + } + { + // Check that lists work well. + torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); + } + { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - ASSERT_EQ(op->schema().arguments().size(), 1); - 
ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); + } } -TEST(CustomOperatorTest, Aliasing) { +void testCustomOperatorAliasing() { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -184,65 +182,77 @@ graph(%x: Tensor, %y: Tensor): } } -static constexpr char op_list[] = "foofoo::bar.template;foo::another"; +void testIValueKWargs() { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + +void testTemplatedOperatorCreator() { + constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ torch::detail::SelectiveStr(n) -TEST(TestCustomOperator, OperatorGeneratorUndeclared) { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); -} + { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); + } -TEST(TestCustomOperator, OperatorGeneratorBasic) { - // The operator should be successfully registered since its name is in the - // whitelist. 
- torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + { + // The operator should be successfully registered since its name is in the + // whitelist. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); + } } } // namespace jit diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 92baba1168da..953d1bf42fc0 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2225,15 +2225,5 @@ void testProfilerDisableInCallback() { t.join(); } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 8f43882c9e22..45d7f48b1f8a 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -9,14 +9,22 @@ namespace torch { namespace jit { #define TH_FORALL_TESTS(_) \ + _(ADFormulas) \ _(Attributes) \ _(Blocks) \ _(CallStack) \ _(CallStackCaching) \ + _(CodeTemplate) \ _(ControlFlow) \ + _(CreateAutodiffSubgraphs) \ + _(CustomOperators) \ + _(CustomOperatorAliasing) \ + _(TemplatedOperatorCreator) \ _(IValueKWargs) \ _(CustomFusion) \ _(SchemaMatching) \ + _(Differentiate) \ + _(DifferentiateWithRequiresGrad) \ _(FromQualString) \ _(InternedStrings) \ _(PassManagement) \ @@ -27,9 +35,12 @@ namespace jit { _(SubgraphUtils) \ _(SubgraphUtilsVmap) \ _(IRParser) \ 
+ _(ConstantPooling) \ + _(CleanUpPasses) \ _(THNNConv) \ _(ATenNativeBatchNorm) \ _(NoneSchemaMatch) \ + _(ClassParser) \ _(UnifyTypes) \ _(Profiler) \ _(FallbackGraphs) \ @@ -50,11 +61,15 @@ namespace jit { _(ModuleDeepcopyAliasing) \ _(ModuleDefine) \ _(QualifiedName) \ + _(ClassImport) \ + _(ScriptObject) \ _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ _(DCE) \ _(CustomFusionNestedBlocks) \ + _(ClassDerive) \ + _(SaveLoadTorchbind) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ _(ClassTypeAddRemoveAttr) \ @@ -85,6 +100,7 @@ namespace jit { _(LiteInterpreterHierarchyModuleInfo) \ _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ _(LiteInterpreterEval) \ + _(TorchbindIValueAPI) \ _(LiteInterpreterDict) \ _(LiteInterpreterFindAndRunMethod) \ _(LiteInterpreterFindWrongMethodName) \ From 27c7158166089db7329b9f0dea65da36e3785cda Mon Sep 17 00:00:00 2001 From: Bugra Akyildiz Date: Wed, 23 Sep 2020 17:55:24 -0700 Subject: [PATCH 071/449] Remove __future__ imports for legacy Python2 support (#45033) Summary: The `2to3` tool can be run with its `future` fixer to remove these imports automatically; the `caffe2` directory has the most redundant imports: ```2to3 -f future -w caffe2``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/45033 Reviewed By: seemethere Differential Revision: D23808648 Pulled By: bugra fbshipit-source-id: 38971900f0fe43ab44a9168e57f2307580d36a38 --- caffe2/contrib/aten/aten_test.py | 8 ++++---- .../contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py | 2 +- caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_fusions.py | 2 +- caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py | 2 +- caffe2/contrib/fakelowp/test/test_int8_quant.py | 2 +- caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py | 8 ++++---- caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py | 2 +- caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py | 2 +- caffe2/contrib/gloo/gloo_test.py | 8 ++++---- caffe2/contrib/nccl/nccl_ops_test.py | 8 ++++---- caffe2/contrib/nnpack/nnpack_ops_test.py | 8 ++++---- caffe2/contrib/playground/AnyExp.py | 8 ++++---- caffe2/contrib/playground/AnyExpOnTerm.py | 8 ++++---- caffe2/contrib/playground/ModuleRegister.py | 8 ++++---- caffe2/contrib/playground/checkpoint.py | 8 ++++---- caffe2/contrib/playground/compute_loss.py | 8 ++++---- caffe2/contrib/playground/compute_topk_accuracy.py | 8 ++++---- caffe2/contrib/playground/meter.py | 8 ++++---- caffe2/contrib/playground/module_map.py | 8 ++++---- caffe2/contrib/playground/output_generator.py | 8 ++++---- caffe2/contrib/playground/resnetdemo/IN1k_resnet.py | 8 ++++---- .../playground/resnetdemo/IN1k_resnet_no_test_model.py | 8 ++++---- .../resnetdemo/caffe2_resnet50_default_forward.py | 8 ++++---- .../resnetdemo/caffe2_resnet50_default_param_update.py | 8 ++++---- .../playground/resnetdemo/explicit_resnet_forward.py | 8 ++++---- .../playground/resnetdemo/explicit_resnet_param_update.py | 8 ++++---- caffe2/contrib/playground/resnetdemo/gfs_IN1k.py | 8 ++++---- .../resnetdemo/override_no_test_model_no_checkpoint.py | 8 ++++---- .../contrib/playground/resnetdemo/rendezvous_filestore.py | 8 ++++---- caffe2/contrib/prof/cuda_profile_ops_test.py
| 8 ++++---- caffe2/contrib/tensorboard/tensorboard.py | 8 ++++---- caffe2/contrib/tensorboard/tensorboard_exporter.py | 8 ++++---- caffe2/contrib/tensorboard/tensorboard_exporter_test.py | 8 ++++---- caffe2/contrib/tensorboard/tensorboard_test.py | 8 ++++---- caffe2/contrib/warpctc/ctc_ops_test.py | 6 +++--- caffe2/core/nomnigraph/op_gen.py | 8 ++++---- caffe2/distributed/file_store_handler_op_test.py | 8 ++++---- caffe2/distributed/redis_store_handler_op_test.py | 8 ++++---- caffe2/distributed/store_ops_test_util.py | 8 ++++---- caffe2/experiments/python/SparseTransformer.py | 8 ++++---- caffe2/experiments/python/convnet_benchmarks.py | 8 ++++---- caffe2/experiments/python/device_reduce_sum_bench.py | 8 ++++---- caffe2/experiments/python/funhash_op_test.py | 8 ++++---- caffe2/experiments/python/net_construct_bench.py | 8 ++++---- caffe2/experiments/python/sparse_funhash_op_test.py | 8 ++++---- caffe2/experiments/python/sparse_reshape_op_test.py | 8 ++++---- caffe2/experiments/python/tt_contraction_op_test.py | 8 ++++---- caffe2/experiments/python/tt_pad_op_test.py | 8 ++++---- caffe2/perfkernels/hp_emblookup_codegen.py | 2 +- caffe2/python/__init__.py | 2 +- caffe2/python/allcompare_test.py | 8 ++++---- caffe2/python/attention.py | 8 ++++---- caffe2/python/benchmark_generator.py | 8 ++++---- .../benchmarks/fused_rowwise_nbit_conversion_bench.py | 2 +- .../benchmarks/sparse_lengths_sum_nbit_benchmark.py | 2 +- caffe2/python/binarysize.py | 8 ++++---- caffe2/python/brew.py | 8 ++++---- caffe2/python/brew_test.py | 8 ++++---- caffe2/python/build.py | 8 ++++---- caffe2/python/cached_reader.py | 8 ++++---- caffe2/python/checkpoint.py | 8 ++++---- caffe2/python/checkpoint_test.py | 8 ++++---- caffe2/python/cnn.py | 8 ++++---- caffe2/python/context.py | 8 ++++---- caffe2/python/context_test.py | 8 ++++---- caffe2/python/control.py | 8 ++++---- caffe2/python/control_ops_grad.py | 8 ++++---- caffe2/python/control_ops_grad_test.py | 8 ++++---- caffe2/python/control_ops_util.py | 8 ++++---- caffe2/python/control_test.py | 8 ++++---- caffe2/python/convert.py | 8 ++++---- caffe2/python/convert_test.py | 8 ++++---- caffe2/python/core.py | 8 ++++---- caffe2/python/core_gradients_test.py | 8 ++++---- caffe2/python/core_test.py | 8 ++++---- caffe2/python/crf.py | 2 +- caffe2/python/crf_predict.py | 2 +- caffe2/python/crf_viterbi_test.py | 8 ++++---- caffe2/python/data_parallel_model.py | 6 +++--- caffe2/python/data_parallel_model_test.py | 6 +++--- caffe2/python/data_workers.py | 8 ++++---- caffe2/python/data_workers_test.py | 8 ++++---- caffe2/python/dataio.py | 8 ++++---- caffe2/python/dataio_test.py | 8 ++++---- caffe2/python/dataset.py | 8 ++++---- caffe2/python/db_file_reader.py | 8 ++++---- caffe2/python/db_test.py | 8 ++++---- caffe2/python/docs/formatter.py | 8 ++++---- caffe2/python/docs/generator.py | 8 ++++---- caffe2/python/docs/github.py | 8 ++++---- caffe2/python/docs/parser.py | 8 ++++---- caffe2/python/dyndep.py | 8 ++++---- caffe2/python/embedding_generation_benchmark.py | 8 ++++---- caffe2/python/examples/char_rnn.py | 8 ++++---- caffe2/python/examples/lmdb_create_example.py | 8 ++++---- caffe2/python/experiment_util.py | 8 ++++---- caffe2/python/extension_loader.py | 8 ++++---- caffe2/python/fakefp16_transform_lib.py | 6 +++--- caffe2/python/fakelowp/init_shared_libs.py | 2 +- caffe2/python/fakelowp/test_utils.py | 8 ++++---- caffe2/python/filler_test.py | 6 +++--- caffe2/python/functional.py | 8 ++++---- caffe2/python/functional_test.py | 8 ++++---- 
caffe2/python/fused_8bit_rowwise_conversion_ops_test.py | 8 ++++---- caffe2/python/gradient_check_test.py | 8 ++++---- caffe2/python/gradient_checker.py | 8 ++++---- caffe2/python/gru_cell.py | 8 ++++---- caffe2/python/helpers/algebra.py | 8 ++++---- caffe2/python/helpers/arg_scope.py | 6 +++--- caffe2/python/helpers/array_helpers.py | 8 ++++---- caffe2/python/helpers/control_ops.py | 8 ++++---- caffe2/python/helpers/conv.py | 8 ++++---- caffe2/python/helpers/db_input.py | 8 ++++---- caffe2/python/helpers/dropout.py | 8 ++++---- caffe2/python/helpers/elementwise_linear.py | 8 ++++---- caffe2/python/helpers/fc.py | 8 ++++---- caffe2/python/helpers/nonlinearity.py | 8 ++++---- caffe2/python/helpers/normalization.py | 8 ++++---- caffe2/python/helpers/pooling.py | 8 ++++---- caffe2/python/helpers/tools.py | 8 ++++---- caffe2/python/helpers/train.py | 8 ++++---- caffe2/python/hip_test_util.py | 8 ++++---- caffe2/python/hsm_util.py | 8 ++++---- caffe2/python/hypothesis_test.py | 6 +++--- caffe2/python/hypothesis_test_util.py | 8 ++++---- caffe2/python/ideep/LRN_op_test.py | 8 ++++---- caffe2/python/ideep/adam_op_test.py | 8 ++++---- caffe2/python/ideep/blobs_queue_db_test.py | 8 ++++---- caffe2/python/ideep/channel_shuffle_op_test.py | 8 ++++---- caffe2/python/ideep/concat_split_op_test.py | 8 ++++---- caffe2/python/ideep/conv_op_test.py | 8 ++++---- caffe2/python/ideep/conv_transpose_test.py | 6 +++--- caffe2/python/ideep/convfusion_op_test.py | 8 ++++---- caffe2/python/ideep/copy_op_test.py | 8 ++++---- caffe2/python/ideep/dropout_op_test.py | 8 ++++---- caffe2/python/ideep/elementwise_sum_op_test.py | 8 ++++---- caffe2/python/ideep/expanddims_squeeze_op_test.py | 8 ++++---- caffe2/python/ideep/fc_op_test.py | 8 ++++---- caffe2/python/ideep/leaky_relu_op_test.py | 8 ++++---- caffe2/python/ideep/moment_sgd_op_test.py | 8 ++++---- caffe2/python/ideep/operator_fallback_op_test.py | 8 ++++---- caffe2/python/ideep/order_switch_op_test.py | 8 ++++---- caffe2/python/ideep/pool_op_test.py | 8 ++++---- caffe2/python/ideep/pre_convert_test.py | 8 ++++---- caffe2/python/ideep/relu_op_test.py | 8 ++++---- caffe2/python/ideep/reshape_op_test.py | 8 ++++---- caffe2/python/ideep/shape_op_test.py | 8 ++++---- caffe2/python/ideep/sigmoid_op_test.py | 8 ++++---- caffe2/python/ideep/softmax_op_test.py | 8 ++++---- caffe2/python/ideep/spatial_bn_op_test.py | 8 ++++---- caffe2/python/ideep/test_ideep_net.py | 8 ++++---- caffe2/python/ideep/transform_ideep_net.py | 8 ++++---- caffe2/python/ideep/transpose_op_test.py | 8 ++++---- caffe2/python/ideep/weightedsum_op_test.py | 8 ++++---- caffe2/python/ideep_test_util.py | 8 ++++---- caffe2/python/layer_model_helper.py | 8 ++++---- caffe2/python/layer_model_instantiator.py | 8 ++++---- caffe2/python/layer_parameter_sharing_test.py | 8 ++++---- caffe2/python/layer_test_util.py | 8 ++++---- caffe2/python/layers/__init__.py | 8 ++++---- caffe2/python/layers/adaptive_weight.py | 2 +- caffe2/python/layers/add_bias.py | 8 ++++---- caffe2/python/layers/arc_cosine_feature_map.py | 8 ++++---- caffe2/python/layers/batch_huber_loss.py | 8 ++++---- caffe2/python/layers/batch_lr_loss.py | 8 ++++---- caffe2/python/layers/batch_mse_loss.py | 8 ++++---- caffe2/python/layers/batch_normalization.py | 8 ++++---- caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py | 8 ++++---- caffe2/python/layers/batch_softmax_loss.py | 8 ++++---- caffe2/python/layers/blob_weighted_sum.py | 8 ++++---- caffe2/python/layers/bpr_loss.py | 8 ++++---- caffe2/python/layers/bucket_weighted.py | 8 ++++---- 
caffe2/python/layers/build_index.py | 8 ++++---- caffe2/python/layers/concat.py | 8 ++++---- caffe2/python/layers/constant_weight.py | 8 ++++---- caffe2/python/layers/conv.py | 8 ++++---- caffe2/python/layers/dropout.py | 8 ++++---- caffe2/python/layers/fc.py | 8 ++++---- caffe2/python/layers/fc_with_bootstrap.py | 2 +- caffe2/python/layers/fc_without_bias.py | 8 ++++---- caffe2/python/layers/feature_sparse_to_dense.py | 2 +- caffe2/python/layers/functional.py | 8 ++++---- caffe2/python/layers/gather_record.py | 8 ++++---- caffe2/python/layers/homotopy_weight.py | 8 ++++---- caffe2/python/layers/label_smooth.py | 8 ++++---- caffe2/python/layers/last_n_window_collector.py | 8 ++++---- caffe2/python/layers/layer_normalization.py | 8 ++++---- caffe2/python/layers/layers.py | 2 +- caffe2/python/layers/margin_rank_loss.py | 8 ++++---- caffe2/python/layers/merge_id_lists.py | 8 ++++---- caffe2/python/layers/pairwise_similarity.py | 8 ++++---- caffe2/python/layers/position_weighted.py | 8 ++++---- caffe2/python/layers/random_fourier_features.py | 8 ++++---- caffe2/python/layers/reservoir_sampling.py | 8 ++++---- caffe2/python/layers/sampling_train.py | 8 ++++---- caffe2/python/layers/sampling_trainable_mixin.py | 8 ++++---- caffe2/python/layers/select_record_by_context.py | 8 ++++---- caffe2/python/layers/semi_random_features.py | 8 ++++---- caffe2/python/layers/sparse_dropout_with_replacement.py | 8 ++++---- caffe2/python/layers/sparse_feature_hash.py | 8 ++++---- caffe2/python/layers/sparse_lookup.py | 8 ++++---- caffe2/python/layers/split.py | 8 ++++---- caffe2/python/layers/tags.py | 8 ++++---- caffe2/python/layers/uniform_sampling.py | 8 ++++---- caffe2/python/layers_test.py | 8 ++++---- caffe2/python/lazy_dyndep.py | 8 ++++---- caffe2/python/lazy_dyndep_test.py | 8 ++++---- .../python/lengths_reducer_fused_8bit_rowwise_ops_test.py | 2 +- caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py | 8 ++++---- caffe2/python/lstm_benchmark.py | 8 ++++---- caffe2/python/memonger.py | 8 ++++---- caffe2/python/memonger_test.py | 8 ++++---- caffe2/python/mkl/mkl_LRN_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_LRN_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_concat_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_conv_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_copy_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_elementwise_add_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_elementwise_sum_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_fc_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_fc_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_fill_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_pool_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_pool_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_relu_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_sbn_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_sbn_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_sigmoid_op_test.py | 8 ++++---- caffe2/python/mkl/mkl_speed_test.py | 8 ++++---- caffe2/python/mkl/mkl_squeeze_op_test.py | 8 ++++---- caffe2/python/mkl/rewrite_graph.py | 8 ++++---- caffe2/python/mkl/rewrite_graph_test.py | 8 ++++---- caffe2/python/mkl_test_util.py | 8 ++++---- caffe2/python/model_helper.py | 8 ++++---- caffe2/python/model_helper_test.py | 2 +- caffe2/python/modeling/compute_histogram_for_blobs.py | 8 ++++---- .../python/modeling/compute_histogram_for_blobs_test.py | 8 ++++---- caffe2/python/modeling/compute_norm_for_blobs.py | 8 ++++---- caffe2/python/modeling/compute_norm_for_blobs_test.py | 8 ++++---- 
caffe2/python/modeling/compute_statistics_for_blobs.py | 8 ++++---- .../python/modeling/compute_statistics_for_blobs_test.py | 8 ++++---- caffe2/python/modeling/get_entry_from_blobs.py | 8 ++++---- caffe2/python/modeling/get_entry_from_blobs_test.py | 8 ++++---- caffe2/python/modeling/gradient_clipping.py | 8 ++++---- caffe2/python/modeling/gradient_clipping_test.py | 8 ++++---- caffe2/python/modeling/initializers.py | 8 ++++---- caffe2/python/modeling/initializers_test.py | 8 ++++---- caffe2/python/modeling/net_modifier.py | 8 ++++---- caffe2/python/modeling/parameter_info.py | 8 ++++---- caffe2/python/modeling/parameter_sharing.py | 8 ++++---- caffe2/python/modeling/parameter_sharing_test.py | 8 ++++---- caffe2/python/models/__sym_init__.py | 8 ++++---- caffe2/python/models/download.py | 8 ++++---- caffe2/python/models/imagenet_trainer_test_utils.py | 8 ++++---- caffe2/python/models/resnet.py | 6 +++--- caffe2/python/models/resnet_test.py | 8 ++++---- caffe2/python/models/seq2seq/beam_search.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_beam_search_test.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_model_helper.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_model_helper_test.py | 8 ++++---- caffe2/python/models/seq2seq/seq2seq_util.py | 8 ++++---- caffe2/python/models/seq2seq/train.py | 8 ++++---- caffe2/python/models/seq2seq/translate.py | 8 ++++---- caffe2/python/models/shufflenet.py | 8 ++++---- caffe2/python/models/shufflenet_test.py | 8 ++++---- caffe2/python/modifier_context.py | 8 ++++---- caffe2/python/net_builder.py | 8 ++++---- caffe2/python/net_builder_test.py | 8 ++++---- caffe2/python/net_drawer.py | 8 ++++---- caffe2/python/net_printer.py | 8 ++++---- caffe2/python/net_printer_test.py | 8 ++++---- caffe2/python/nomnigraph.py | 2 +- caffe2/python/nomnigraph_test.py | 8 ++++---- caffe2/python/nomnigraph_transformations.py | 2 +- caffe2/python/nomnigraph_transformations_test.py | 8 ++++---- caffe2/python/normalizer.py | 2 +- caffe2/python/normalizer_context.py | 8 ++++---- caffe2/python/normalizer_test.py | 6 +++--- caffe2/python/numa_benchmark.py | 6 +++--- caffe2/python/numa_test.py | 6 +++--- caffe2/python/observer_test.py | 8 ++++---- caffe2/python/onnx/backend.py | 8 ++++---- caffe2/python/onnx/backend_cpp_rep.py | 8 ++++---- caffe2/python/onnx/backend_rep.py | 8 ++++---- caffe2/python/onnx/bin/conversion.py | 6 +++--- caffe2/python/onnx/error.py | 8 ++++---- caffe2/python/onnx/frontend.py | 8 ++++---- caffe2/python/onnx/helper.py | 8 ++++---- caffe2/python/onnx/onnxifi.py | 8 ++++---- caffe2/python/onnx/test_onnxifi.py | 8 ++++---- caffe2/python/onnx/tests/__init__.py | 8 ++++---- caffe2/python/onnx/tests/c2_ref_test.py | 8 ++++---- caffe2/python/onnx/tests/conversion_test.py | 6 +++--- caffe2/python/onnx/tests/helper_test.py | 8 ++++---- caffe2/python/onnx/tests/onnx_backend_test.py | 8 ++++---- caffe2/python/onnx/tests/ssa_test.py | 8 ++++---- caffe2/python/onnx/tests/test_utils.py | 8 ++++---- caffe2/python/onnx/workspace.py | 8 ++++---- caffe2/python/operator_fp_exceptions_test.py | 6 +++--- caffe2/python/operator_test/activation_ops_test.py | 8 ++++---- caffe2/python/operator_test/adadelta_test.py | 8 ++++---- caffe2/python/operator_test/adagrad_test.py | 2 +- caffe2/python/operator_test/adagrad_test_helper.py | 2 +- caffe2/python/operator_test/adam_test.py | 8 ++++---- caffe2/python/operator_test/affine_channel_op_test.py | 6 +++--- caffe2/python/operator_test/apmeter_test.py | 8 ++++---- caffe2/python/operator_test/arg_ops_test.py | 8 ++++---- 
caffe2/python/operator_test/assert_test.py | 6 +++--- caffe2/python/operator_test/atomic_ops_test.py | 8 ++++---- caffe2/python/operator_test/basic_rnn_test.py | 8 ++++---- caffe2/python/operator_test/batch_box_cox_test.py | 8 ++++---- caffe2/python/operator_test/batch_bucketize_op_test.py | 8 ++++---- caffe2/python/operator_test/batch_moments_op_test.py | 6 +++--- .../python/operator_test/batch_sparse_to_dense_op_test.py | 8 ++++---- caffe2/python/operator_test/bbox_transform_test.py | 8 ++++---- caffe2/python/operator_test/bisect_percentile_op_test.py | 8 ++++---- caffe2/python/operator_test/blobs_queue_db_test.py | 8 ++++---- caffe2/python/operator_test/boolean_mask_test.py | 6 +++--- caffe2/python/operator_test/boolean_unmask_test.py | 8 ++++---- caffe2/python/operator_test/box_with_nms_limit_op_test.py | 8 ++++---- caffe2/python/operator_test/bucketize_op_test.py | 8 ++++---- caffe2/python/operator_test/cast_op_test.py | 8 ++++---- caffe2/python/operator_test/ceil_op_test.py | 8 ++++---- .../operator_test/channel_backprop_stats_op_test.py | 8 ++++---- caffe2/python/operator_test/channel_shuffle_test.py | 2 +- caffe2/python/operator_test/channel_stats_op_test.py | 6 +++--- caffe2/python/operator_test/checkpoint_test.py | 8 ++++---- caffe2/python/operator_test/clip_op_test.py | 8 ++++---- caffe2/python/operator_test/clip_tensor_op_test.py | 8 ++++---- .../collect_and_distribute_fpn_rpn_proposals_op_test.py | 8 ++++---- caffe2/python/operator_test/concat_split_op_test.py | 8 ++++---- caffe2/python/operator_test/conditional_test.py | 6 +++--- caffe2/python/operator_test/conftest.py | 8 ++++---- caffe2/python/operator_test/conv_test.py | 2 +- caffe2/python/operator_test/conv_transpose_test.py | 6 +++--- caffe2/python/operator_test/copy_ops_test.py | 8 ++++---- .../python/operator_test/copy_rows_to_tensor_op_test.py | 2 +- .../operator_test/cosine_embedding_criterion_op_test.py | 8 ++++---- caffe2/python/operator_test/counter_ops_test.py | 8 ++++---- caffe2/python/operator_test/crf_test.py | 8 ++++---- caffe2/python/operator_test/cross_entropy_ops_test.py | 8 ++++---- .../operator_test/ctc_beam_search_decoder_op_test.py | 8 ++++---- caffe2/python/operator_test/ctc_greedy_decoder_op_test.py | 8 ++++---- caffe2/python/operator_test/cudnn_recurrent_test.py | 8 ++++---- caffe2/python/operator_test/data_couple_op_test.py | 8 ++++---- caffe2/python/operator_test/dataset_ops_test.py | 8 ++++---- caffe2/python/operator_test/deform_conv_test.py | 2 +- .../operator_test/dense_vector_to_id_list_op_test.py | 8 ++++---- caffe2/python/operator_test/depthwise_3x3_conv_test.py | 8 ++++---- caffe2/python/operator_test/detectron_keypoints.py | 8 ++++---- caffe2/python/operator_test/distance_op_test.py | 8 ++++---- caffe2/python/operator_test/dropout_op_test.py | 8 ++++---- caffe2/python/operator_test/duplicate_operands_test.py | 8 ++++---- caffe2/python/operator_test/elementwise_linear_op_test.py | 8 ++++---- .../python/operator_test/elementwise_logical_ops_test.py | 8 ++++---- .../python/operator_test/elementwise_op_broadcast_test.py | 8 ++++---- caffe2/python/operator_test/elementwise_ops_test.py | 8 ++++---- caffe2/python/operator_test/emptysample_ops_test.py | 8 ++++---- caffe2/python/operator_test/enforce_finite_op_test.py | 8 ++++---- caffe2/python/operator_test/ensure_clipped_test.py | 2 +- caffe2/python/operator_test/ensure_cpu_output_op_test.py | 8 ++++---- caffe2/python/operator_test/erf_op_test.py | 8 ++++---- caffe2/python/operator_test/expand_op_test.py | 8 ++++---- 
caffe2/python/operator_test/fc_operator_test.py | 8 ++++---- caffe2/python/operator_test/feature_maps_ops_test.py | 8 ++++---- caffe2/python/operator_test/filler_ops_test.py | 8 ++++---- caffe2/python/operator_test/find_op_test.py | 8 ++++---- caffe2/python/operator_test/flatten_op_test.py | 8 ++++---- caffe2/python/operator_test/flexible_top_k_test.py | 8 ++++---- caffe2/python/operator_test/floor_op_test.py | 8 ++++---- .../fused_nbit_rowwise_conversion_ops_test.py | 2 +- .../operator_test/fused_nbit_rowwise_test_helper.py | 2 +- caffe2/python/operator_test/gather_ops_test.py | 8 ++++---- caffe2/python/operator_test/gather_ranges_op_test.py | 2 +- .../given_tensor_byte_string_to_uint8_fill_op_test.py | 8 ++++---- caffe2/python/operator_test/given_tensor_fill_op_test.py | 8 ++++---- caffe2/python/operator_test/glu_op_test.py | 8 ++++---- caffe2/python/operator_test/group_conv_test.py | 6 +++--- caffe2/python/operator_test/group_norm_op_test.py | 6 +++--- caffe2/python/operator_test/gru_test.py | 8 ++++---- .../python/operator_test/heatmap_max_keypoint_op_test.py | 8 ++++---- caffe2/python/operator_test/hsm_test.py | 8 ++++---- caffe2/python/operator_test/hyperbolic_ops_test.py | 8 ++++---- caffe2/python/operator_test/im2col_col2im_test.py | 8 ++++---- caffe2/python/operator_test/image_input_op_test.py | 8 ++++---- caffe2/python/operator_test/index_hash_ops_test.py | 8 ++++---- caffe2/python/operator_test/index_ops_test.py | 8 ++++---- caffe2/python/operator_test/instance_norm_test.py | 6 +++--- caffe2/python/operator_test/integral_image_ops_test.py | 8 ++++---- caffe2/python/operator_test/jsd_ops_test.py | 8 ++++---- caffe2/python/operator_test/key_split_ops_test.py | 8 ++++---- caffe2/python/operator_test/lars_test.py | 8 ++++---- caffe2/python/operator_test/layer_norm_op_test.py | 8 ++++---- caffe2/python/operator_test/leaky_relu_test.py | 6 +++--- .../operator_test/learning_rate_adaption_op_test.py | 8 ++++---- caffe2/python/operator_test/learning_rate_op_test.py | 8 ++++---- caffe2/python/operator_test/length_split_op_test.py | 8 ++++---- caffe2/python/operator_test/lengths_pad_op_test.py | 8 ++++---- .../lengths_reducer_fused_nbit_rowwise_ops_test.py | 2 +- caffe2/python/operator_test/lengths_tile_op_test.py | 8 ++++---- caffe2/python/operator_test/lengths_top_k_ops_test.py | 8 ++++---- caffe2/python/operator_test/listwise_l2r_operator_test.py | 2 +- caffe2/python/operator_test/load_save_test.py | 8 ++++---- caffe2/python/operator_test/locally_connected_op_test.py | 6 +++--- caffe2/python/operator_test/loss_ops_test.py | 8 ++++---- caffe2/python/operator_test/lpnorm_op_test.py | 8 ++++---- caffe2/python/operator_test/map_ops_test.py | 8 ++++---- .../operator_test/margin_ranking_criterion_op_test.py | 8 ++++---- caffe2/python/operator_test/math_ops_test.py | 8 ++++---- caffe2/python/operator_test/matmul_op_test.py | 8 ++++---- caffe2/python/operator_test/mean_op_test.py | 8 ++++---- caffe2/python/operator_test/merge_id_lists_op_test.py | 8 ++++---- caffe2/python/operator_test/mkl_conv_op_test.py | 8 ++++---- caffe2/python/operator_test/mkl_packed_fc_op_test.py | 8 ++++---- caffe2/python/operator_test/mod_op_test.py | 8 ++++---- caffe2/python/operator_test/moments_op_test.py | 8 ++++---- caffe2/python/operator_test/momentum_sgd_test.py | 8 ++++---- caffe2/python/operator_test/mpi_test.py | 8 ++++---- caffe2/python/operator_test/mul_gradient_benchmark.py | 8 ++++---- caffe2/python/operator_test/negate_gradient_op_test.py | 8 ++++---- caffe2/python/operator_test/ngram_ops_test.py | 8 
++++---- caffe2/python/operator_test/normalize_op_test.py | 6 +++--- caffe2/python/operator_test/numpy_tile_op_test.py | 8 ++++---- caffe2/python/operator_test/one_hot_ops_test.py | 8 ++++---- caffe2/python/operator_test/onnx_while_test.py | 6 +++--- caffe2/python/operator_test/order_switch_test.py | 2 +- caffe2/python/operator_test/pack_ops_test.py | 8 ++++---- caffe2/python/operator_test/pack_rnn_sequence_op_test.py | 8 ++++---- caffe2/python/operator_test/pad_test.py | 6 +++--- caffe2/python/operator_test/partition_ops_test.py | 8 ++++---- caffe2/python/operator_test/percentile_op_test.py | 8 ++++---- .../operator_test/piecewise_linear_transform_test.py | 8 ++++---- caffe2/python/operator_test/pooling_test.py | 6 +++--- caffe2/python/operator_test/prepend_dim_test.py | 8 ++++---- caffe2/python/operator_test/python_op_test.py | 8 ++++---- caffe2/python/operator_test/quantile_test.py | 2 +- .../operator_test/rand_quantization_op_speed_test.py | 2 +- caffe2/python/operator_test/rand_quantization_op_test.py | 8 ++++---- caffe2/python/operator_test/rank_loss_operator_test.py | 8 ++++---- caffe2/python/operator_test/rebatching_queue_test.py | 8 ++++---- caffe2/python/operator_test/record_queue_test.py | 8 ++++---- .../python/operator_test/recurrent_net_executor_test.py | 8 ++++---- caffe2/python/operator_test/recurrent_network_test.py | 8 ++++---- caffe2/python/operator_test/reduce_ops_test.py | 8 ++++---- caffe2/python/operator_test/reduction_ops_test.py | 8 ++++---- caffe2/python/operator_test/reshape_ops_test.py | 8 ++++---- caffe2/python/operator_test/resize_op_test.py | 6 +++--- caffe2/python/operator_test/rmac_regions_op_test.py | 8 ++++---- caffe2/python/operator_test/rms_norm_op_test.py | 2 +- caffe2/python/operator_test/rnn_cell_test.py | 8 ++++---- caffe2/python/operator_test/roi_align_rotated_op_test.py | 8 ++++---- caffe2/python/operator_test/rowwise_counter_test.py | 2 +- caffe2/python/operator_test/scale_op_test.py | 8 ++++---- caffe2/python/operator_test/segment_ops_test.py | 8 ++++---- caffe2/python/operator_test/selu_op_test.py | 8 ++++---- caffe2/python/operator_test/sequence_ops_test.py | 8 ++++---- caffe2/python/operator_test/shape_inference_test.py | 8 ++++---- .../operator_test/sinusoid_position_encoding_op_test.py | 8 ++++---- caffe2/python/operator_test/softmax_ops_test.py | 8 ++++---- caffe2/python/operator_test/softplus_op_test.py | 8 ++++---- .../sparse_dropout_with_replacement_op_test.py | 8 ++++---- .../python/operator_test/sparse_gradient_checker_test.py | 8 ++++---- .../python/operator_test/sparse_lengths_sum_benchmark.py | 2 +- caffe2/python/operator_test/sparse_lp_regularizer_test.py | 8 ++++---- caffe2/python/operator_test/sparse_normalize_test.py | 8 ++++---- caffe2/python/operator_test/sparse_ops_test.py | 8 ++++---- .../python/operator_test/sparse_to_dense_mask_op_test.py | 8 ++++---- caffe2/python/operator_test/spatial_bn_op_test.py | 8 ++++---- .../python/operator_test/specialized_segment_ops_test.py | 2 +- caffe2/python/operator_test/square_root_divide_op_test.py | 8 ++++---- caffe2/python/operator_test/stats_ops_test.py | 8 ++++---- caffe2/python/operator_test/stats_put_ops_test.py | 8 ++++---- caffe2/python/operator_test/storm_test.py | 8 ++++---- caffe2/python/operator_test/string_ops_test.py | 8 ++++---- caffe2/python/operator_test/text_file_reader_test.py | 8 ++++---- caffe2/python/operator_test/thresholded_relu_op_test.py | 8 ++++---- caffe2/python/operator_test/tile_op_test.py | 8 ++++---- caffe2/python/operator_test/top_k_test.py | 8 ++++---- 
caffe2/python/operator_test/torch_integration_test.py | 2 +- caffe2/python/operator_test/transpose_op_test.py | 6 +++--- caffe2/python/operator_test/trigonometric_op_test.py | 8 ++++---- caffe2/python/operator_test/unique_ops_test.py | 8 ++++---- .../python/operator_test/unique_uniform_fill_op_test.py | 8 ++++---- caffe2/python/operator_test/upsample_op_test.py | 6 +++--- caffe2/python/operator_test/utility_ops_test.py | 8 ++++---- caffe2/python/operator_test/video_input_op_test.py | 2 +- caffe2/python/operator_test/weight_scale_test.py | 8 ++++---- caffe2/python/operator_test/weighted_multi_sample_test.py | 8 ++++---- caffe2/python/operator_test/weighted_sample_test.py | 8 ++++---- caffe2/python/operator_test/weighted_sum_test.py | 8 ++++---- caffe2/python/operator_test/wngrad_test.py | 8 ++++---- caffe2/python/optimizer.py | 2 +- caffe2/python/optimizer_context.py | 8 ++++---- caffe2/python/optimizer_test.py | 6 +++--- caffe2/python/optimizer_test_util.py | 8 ++++---- caffe2/python/parallel_workers.py | 8 ++++---- caffe2/python/parallel_workers_test.py | 8 ++++---- caffe2/python/parallelize_bmuf_distributed_test.py | 6 +++--- caffe2/python/pipeline.py | 8 ++++---- caffe2/python/pipeline_test.py | 8 ++++---- caffe2/python/predictor/mobile_exporter.py | 8 ++++---- caffe2/python/predictor/mobile_exporter_test.py | 8 ++++---- caffe2/python/predictor/predictor_exporter.py | 8 ++++---- caffe2/python/predictor/predictor_exporter_test.py | 8 ++++---- caffe2/python/predictor/predictor_py_utils.py | 8 ++++---- caffe2/python/predictor/predictor_test.py | 8 ++++---- caffe2/python/predictor/serde.py | 8 ++++---- caffe2/python/predictor_constants.py | 8 ++++---- caffe2/python/python_op_test.py | 8 ++++---- caffe2/python/queue_util.py | 8 ++++---- caffe2/python/record_queue.py | 8 ++++---- caffe2/python/recurrent.py | 8 ++++---- caffe2/python/regularizer.py | 2 +- caffe2/python/regularizer_context.py | 8 ++++---- caffe2/python/regularizer_test.py | 2 +- caffe2/python/rnn/__init__.py | 8 ++++---- caffe2/python/rnn/lstm_comparison.py | 8 ++++---- caffe2/python/rnn/rnn_cell_test_util.py | 8 ++++---- caffe2/python/rnn_cell.py | 8 ++++---- caffe2/python/schema.py | 8 ++++---- caffe2/python/schema_test.py | 8 ++++---- caffe2/python/scope.py | 8 ++++---- caffe2/python/scope_test.py | 8 ++++---- caffe2/python/serialized_test/coverage.py | 8 ++++---- caffe2/python/serialized_test/serialized_test_util.py | 8 ++++---- caffe2/python/session.py | 8 ++++---- caffe2/python/session_test.py | 8 ++++---- caffe2/python/sparse_to_dense_mask_test.py | 8 ++++---- caffe2/python/sparse_to_dense_test.py | 8 ++++---- caffe2/python/task.py | 8 ++++---- caffe2/python/task_test.py | 8 ++++---- caffe2/python/test/blob_deallocation_test.py | 6 +++--- caffe2/python/test/do_op_test.py | 6 +++--- caffe2/python/test/executor_test.py | 6 +++--- caffe2/python/test/executor_test_util.py | 6 +++--- caffe2/python/test/fakefp16_transform_test.py | 6 +++--- caffe2/python/test/gpu_context_test.py | 8 ++++---- caffe2/python/test/python_protobuf_test.py | 6 +++--- caffe2/python/test_util.py | 8 ++++---- caffe2/python/text_file_reader.py | 8 ++++---- caffe2/python/timeout_guard.py | 8 ++++---- caffe2/python/transformations.py | 8 ++++---- caffe2/python/transformations_test.py | 8 ++++---- caffe2/python/trt/test_trt.py | 8 ++++---- caffe2/python/trt/transform.py | 8 ++++---- caffe2/python/tt_core.py | 6 +++--- caffe2/python/tt_core_test.py | 8 ++++---- caffe2/python/utils.py | 8 ++++---- caffe2/python/utils_test.py | 8 ++++---- 
caffe2/python/workspace.py | 8 ++++---- caffe2/python/workspace_test.py | 8 ++++---- .../quantization/server/batch_matmul_dnnlowp_op_test.py | 2 +- .../server/batch_permutation_dnnlowp_op_test.py | 2 +- .../server/channel_shuffle_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/concat_dnnlowp_op_test.py | 2 +- .../quantization/server/conv_depthwise_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py | 2 +- caffe2/quantization/server/conv_dnnlowp_op_test.py | 2 +- .../server/conv_groupwise_dnnlowp_acc16_op_test.py | 2 +- .../quantization/server/conv_groupwise_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/dequantize_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/dnnlowp_test_utils.py | 2 +- .../server/elementwise_add_dnnlowp_op_test.py | 2 +- .../server/elementwise_linear_dnnlowp_op_test.py | 2 +- .../server/elementwise_mul_dnnlowp_op_test.py | 2 +- .../server/elementwise_sum_dnnlowp_op_test.py | 2 +- .../server/fully_connected_dnnlowp_acc16_op_test.py | 2 +- .../server/fully_connected_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/fully_connected_fp16_test.py | 2 +- .../server/fully_connected_rowwise_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/gather_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/group_norm_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/int8_gen_quant_params_test.py | 2 +- .../server/int8_quant_scheme_blob_fill_test.py | 2 +- caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/observer_test.py | 2 +- caffe2/quantization/server/pool_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/quantize_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/relu_dnnlowp_op_test.py | 2 +- .../server/resize_nearest_3d_dnnlowp_op_test.py | 2 +- .../quantization/server/resize_nearest_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/sigmoid_dnnlowp_op_test.py | 2 +- .../server/spatial_batch_norm_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/tanh_dnnlowp_op_test.py | 2 +- caffe2/quantization/server/utils.py | 2 +- scripts/get_python_cmake_flags.py | 6 +++--- setup.py | 2 +- tools/amd_build/build_amd.py | 2 +- tools/autograd/gen_variable_type.py | 2 +- tools/clang_tidy.py | 2 +- tools/pyi/gen_pyi.py | 2 +- tools/setup_helpers/cmake.py | 2 +- 597 files changed, 2086 insertions(+), 2086 deletions(-) diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py index 92448fe355de..d9d99a1c1ae9 100644 --- a/caffe2/contrib/aten/aten_test.py +++ b/caffe2/contrib/aten/aten_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given diff --git a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py index a8979ca63aa6..94a76fed85f5 100644 --- a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py index 1a4f57b6aa05..7b1b5f070171 100644 --- a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py +++ 
b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py index 511c29884288..b7a9fc810cfc 100644 --- a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py index bb013a26a609..7a68af63a84b 100644 --- a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 22e78b0756c0..45757badba43 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py index 4c82917f042c..5a91a00706ff 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.fakelowp.init_shared_libs # noqa import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_int8_quant.py b/caffe2/contrib/fakelowp/test/test_int8_quant.py index 83d0cc176def..02095286e1ee 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_quant.py +++ b/caffe2/contrib/fakelowp/test/test_int8_quant.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 698b839f3785..9ff0986116b6 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py index 58161409fa80..e8512b4dcd74 100644 --- a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import 
unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py index 0ca76bd86ba9..a8d6640fa58e 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py index da7eae2708f3..f8fd03cbfb73 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py index ad26952a901c..207403f1bd0d 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index 8eaff9e137ae..fbca9b8fe64c 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index 3f4685548281..2d4e9b518b9b 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/nnpack/nnpack_ops_test.py b/caffe2/contrib/nnpack/nnpack_ops_test.py index b12acd151a71..4bedf0e0ecd6 100644 --- a/caffe2/contrib/nnpack/nnpack_ops_test.py +++ b/caffe2/contrib/nnpack/nnpack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/playground/AnyExp.py b/caffe2/contrib/playground/AnyExp.py index 5d968b0455fc..b8e2f8b37b2a 100644 --- a/caffe2/contrib/playground/AnyExp.py +++ b/caffe2/contrib/playground/AnyExp.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/AnyExpOnTerm.py b/caffe2/contrib/playground/AnyExpOnTerm.py index b269777da675..dcfe61f14545 100644 --- a/caffe2/contrib/playground/AnyExpOnTerm.py +++ b/caffe2/contrib/playground/AnyExpOnTerm.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import 
unicode_literals + + + + import argparse import json diff --git a/caffe2/contrib/playground/ModuleRegister.py b/caffe2/contrib/playground/ModuleRegister.py index 89a9deb8989e..27e0c07f6384 100644 --- a/caffe2/contrib/playground/ModuleRegister.py +++ b/caffe2/contrib/playground/ModuleRegister.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect import logging diff --git a/caffe2/contrib/playground/checkpoint.py b/caffe2/contrib/playground/checkpoint.py index 9887a408cc01..5ea3d2a9035c 100644 --- a/caffe2/contrib/playground/checkpoint.py +++ b/caffe2/contrib/playground/checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import pickle diff --git a/caffe2/contrib/playground/compute_loss.py b/caffe2/contrib/playground/compute_loss.py index 53eb77d77701..2965ff3895ac 100644 --- a/caffe2/contrib/playground/compute_loss.py +++ b/caffe2/contrib/playground/compute_loss.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/compute_topk_accuracy.py b/caffe2/contrib/playground/compute_topk_accuracy.py index 396b797ed1b6..e2f148231c6d 100644 --- a/caffe2/contrib/playground/compute_topk_accuracy.py +++ b/caffe2/contrib/playground/compute_topk_accuracy.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/meter.py b/caffe2/contrib/playground/meter.py index 7e109e445d04..ed0158bbf087 100644 --- a/caffe2/contrib/playground/meter.py +++ b/caffe2/contrib/playground/meter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/module_map.py b/caffe2/contrib/playground/module_map.py index 0f5de5943a36..8eb1a3a00cdc 100644 --- a/caffe2/contrib/playground/module_map.py +++ b/caffe2/contrib/playground/module_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # Input import caffe2.contrib.playground.resnetdemo.\ diff --git a/caffe2/contrib/playground/output_generator.py b/caffe2/contrib/playground/output_generator.py index 41d8e3fdfae4..aaa977c08faa 100644 --- a/caffe2/contrib/playground/output_generator.py +++ b/caffe2/contrib/playground/output_generator.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import timeout_guard diff --git a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py index 52ce95ed5dab..58085dbc3721 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py +++ 
b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py index cf893b598446..480070752e63 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py +++ b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py index 174ffe1e034a..fa0fedd84a8c 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.models.resnet as resnet diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py index 974653446a22..5697d1301b8a 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def gen_param_update_builder_fun(self, model, dataset, is_train): diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py index 01b51fa8450c..056ddd8c9ea0 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging logging.basicConfig() diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py index 8a86289778ee..5378acd61886 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py index 8b2647114b63..496ac22ffde5 100644 --- a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py +++ b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # # 
example1 using gfs as input source. diff --git a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py index 4cc2d68cbfd7..419d6a25e95b 100644 --- a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py +++ b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def checkpoint(self, epoch): self.model_path = None diff --git a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py index d757896793ff..0a56d68257ee 100644 --- a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py +++ b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import dyndep diff --git a/caffe2/contrib/prof/cuda_profile_ops_test.py b/caffe2/contrib/prof/cuda_profile_ops_test.py index 2953503bbea5..c77b7ae88ba6 100644 --- a/caffe2/contrib/prof/cuda_profile_ops_test.py +++ b/caffe2/contrib/prof/cuda_profile_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/tensorboard/tensorboard.py b/caffe2/contrib/tensorboard/tensorboard.py index 9aece77bc09a..6f5ad1896e35 100644 --- a/caffe2/contrib/tensorboard/tensorboard.py +++ b/caffe2/contrib/tensorboard/tensorboard.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click import collections diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index a3c0e0e59723..ef12ce563cde 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from builtins import bytes import copy diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py index 6b9c894e16fb..31ef8180fb57 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/contrib/tensorboard/tensorboard_test.py b/caffe2/contrib/tensorboard/tensorboard_test.py index 494cb6fc7d12..8751be14ead5 100644 --- a/caffe2/contrib/tensorboard/tensorboard_test.py +++ b/caffe2/contrib/tensorboard/tensorboard_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click.testing import 
numpy as np diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 3b21c8b66747..013e80a98773 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from caffe2.proto import caffe2_pb2 diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index 49cd2abb2cef..fbe1c8da377e 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from textwrap import dedent diff --git a/caffe2/distributed/file_store_handler_op_test.py b/caffe2/distributed/file_store_handler_op_test.py index 2e90c548d50f..427b68420d39 100644 --- a/caffe2/distributed/file_store_handler_op_test.py +++ b/caffe2/distributed/file_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import os diff --git a/caffe2/distributed/redis_store_handler_op_test.py b/caffe2/distributed/redis_store_handler_op_test.py index 3df69bf2701a..8f5d58e85185 100644 --- a/caffe2/distributed/redis_store_handler_op_test.py +++ b/caffe2/distributed/redis_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import uuid diff --git a/caffe2/distributed/store_ops_test_util.py b/caffe2/distributed/store_ops_test_util.py index 2abe697cface..05245be9b210 100644 --- a/caffe2/distributed/store_ops_test_util.py +++ b/caffe2/distributed/store_ops_test_util.py @@ -1,9 +1,9 @@ ## @package store_ops_test_util # Module caffe2.distributed.store_ops_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from multiprocessing import Process, Queue diff --git a/caffe2/experiments/python/SparseTransformer.py b/caffe2/experiments/python/SparseTransformer.py index ff9ab7715c33..d97f076a7bb3 100644 --- a/caffe2/experiments/python/SparseTransformer.py +++ b/caffe2/experiments/python/SparseTransformer.py @@ -15,10 +15,10 @@ ## @package SparseTransformer # Module caffe2.experiments.python.SparseTransformer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace import scipy.sparse diff --git a/caffe2/experiments/python/convnet_benchmarks.py b/caffe2/experiments/python/convnet_benchmarks.py index 386c9c4b7ebc..ff9b7a20bc73 100644 --- a/caffe2/experiments/python/convnet_benchmarks.py +++ b/caffe2/experiments/python/convnet_benchmarks.py @@ -15,10 +15,10 @@ ## @package convnet_benchmarks # Module caffe2.experiments.python.convnet_benchmarks -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + """ Benchmark for common convnets. 
diff --git a/caffe2/experiments/python/device_reduce_sum_bench.py b/caffe2/experiments/python/device_reduce_sum_bench.py index dbe0dae4f0c2..1a795e2fcf0e 100644 --- a/caffe2/experiments/python/device_reduce_sum_bench.py +++ b/caffe2/experiments/python/device_reduce_sum_bench.py @@ -15,10 +15,10 @@ ## @package device_reduce_sum_bench # Module caffe2.experiments.python.device_reduce_sum_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import itertools diff --git a/caffe2/experiments/python/funhash_op_test.py b/caffe2/experiments/python/funhash_op_test.py index 6a4eb0e6b5b5..3fc4c8bf54fd 100644 --- a/caffe2/experiments/python/funhash_op_test.py +++ b/caffe2/experiments/python/funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/net_construct_bench.py b/caffe2/experiments/python/net_construct_bench.py index b7cf605c0c04..ec12517c03be 100644 --- a/caffe2/experiments/python/net_construct_bench.py +++ b/caffe2/experiments/python/net_construct_bench.py @@ -15,10 +15,10 @@ ## @package net_construct_bench # Module caffe2.experiments.python.net_construct_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import logging diff --git a/caffe2/experiments/python/sparse_funhash_op_test.py b/caffe2/experiments/python/sparse_funhash_op_test.py index 2af006249c7d..cfc7a0bb6165 100644 --- a/caffe2/experiments/python/sparse_funhash_op_test.py +++ b/caffe2/experiments/python/sparse_funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/sparse_reshape_op_test.py b/caffe2/experiments/python/sparse_reshape_op_test.py index 5849580f09e1..a22bf561ce86 100644 --- a/caffe2/experiments/python/sparse_reshape_op_test.py +++ b/caffe2/experiments/python/sparse_reshape_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/tt_contraction_op_test.py b/caffe2/experiments/python/tt_contraction_op_test.py index 4cd04a16ea23..1e41e9ed8ddd 100644 --- a/caffe2/experiments/python/tt_contraction_op_test.py +++ b/caffe2/experiments/python/tt_contraction_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/experiments/python/tt_pad_op_test.py b/caffe2/experiments/python/tt_pad_op_test.py index 10be7adcb453..27d13543348b 100644 --- a/caffe2/experiments/python/tt_pad_op_test.py +++ b/caffe2/experiments/python/tt_pad_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index f79b7c8e7d9c..75b0c8b583be 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import sys diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 09b5652e61f2..8582eff9ce19 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.proto import caffe2_pb2 import os import sys diff --git a/caffe2/python/allcompare_test.py b/caffe2/python/allcompare_test.py index 663cc9e02864..22038715f289 100644 --- a/caffe2/python/allcompare_test.py +++ b/caffe2/python/allcompare_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py index 73be94feaf2b..59f4a5adb6a5 100644 --- a/caffe2/python/attention.py +++ b/caffe2/python/attention.py @@ -1,9 +1,9 @@ ## @package attention # Module caffe2.python.attention -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py index 8393ca7875aa..84d0d46490b0 100644 --- a/caffe2/python/benchmark_generator.py +++ b/caffe2/python/benchmark_generator.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import string import argparse diff --git a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py index 9b9a196e9770..ce96dbc1dd63 100644 --- a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py +++ b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index bdba35545255..1b683be0d51e 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py 
+++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git a/caffe2/python/binarysize.py b/caffe2/python/binarysize.py index 802d61025e30..39dba40df8a0 100644 --- a/caffe2/python/binarysize.py +++ b/caffe2/python/binarysize.py @@ -15,10 +15,10 @@ green, assuming that you have a xterm connection that supports color. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import subprocess import sys diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py index 2722c21d84d0..0e050ec32c44 100644 --- a/caffe2/python/brew.py +++ b/caffe2/python/brew.py @@ -1,9 +1,9 @@ ## @package model_helper_api # Module caffe2.python.model_helper_api -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import copy diff --git a/caffe2/python/brew_test.py b/caffe2/python/brew_test.py index 8b3d08977c2c..4973876a8008 100644 --- a/caffe2/python/brew_test.py +++ b/caffe2/python/brew_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, scope, workspace from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/build.py b/caffe2/python/build.py index 0f447265d5f4..862c031004c5 100644 --- a/caffe2/python/build.py +++ b/caffe2/python/build.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/cached_reader.py b/caffe2/python/cached_reader.py index 1dd179c71caf..980c4fe40e08 100644 --- a/caffe2/python/cached_reader.py +++ b/caffe2/python/cached_reader.py @@ -1,9 +1,9 @@ ## @package cached_reader # Module caffe2.python.cached_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index cdd96eb1f492..9d7797fc3ada 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -1,9 +1,9 @@ ## @package checkpoint # Module caffe2.python.checkpoint -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import logging diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py index a91bbf9910e2..90746747dd98 100644 --- a/caffe2/python/checkpoint_test.py +++ b/caffe2/python/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import Struct, ConstRecord from caffe2.python import core, workspace, model_helper diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index aead1d599474..a0fd52e1fdbc 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -1,9 +1,9 @@ ## @package cnn # Module caffe2.python.cnn -from __future__ 
import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, workspace from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/context.py b/caffe2/python/context.py index 928807ba2805..28815bb7f36b 100644 --- a/caffe2/python/context.py +++ b/caffe2/python/context.py @@ -1,9 +1,9 @@ ## @package context # Module caffe2.python.context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import threading import six diff --git a/caffe2/python/context_test.py b/caffe2/python/context_test.py index 6a1f77f5ecf8..6c259d326a19 100644 --- a/caffe2/python/context_test.py +++ b/caffe2/python/context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context, test_util from threading import Thread diff --git a/caffe2/python/control.py b/caffe2/python/control.py index dd332f745f9a..6b0654d6f26e 100644 --- a/caffe2/python/control.py +++ b/caffe2/python/control.py @@ -11,10 +11,10 @@ If """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from future.utils import viewitems diff --git a/caffe2/python/control_ops_grad.py b/caffe2/python/control_ops_grad.py index 5a8d24cf55d8..a0e85f4d0bc1 100644 --- a/caffe2/python/control_ops_grad.py +++ b/caffe2/python/control_ops_grad.py @@ -1,9 +1,9 @@ ## @package control_ops_grad # Module caffe2.python.control_ops_grad -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/control_ops_grad_test.py b/caffe2/python/control_ops_grad_test.py index a84b9ca0a168..f637e38a5e33 100644 --- a/caffe2/python/control_ops_grad_test.py +++ b/caffe2/python/control_ops_grad_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/control_ops_util.py b/caffe2/python/control_ops_util.py index 76ab14a7bc65..cfff82de318b 100644 --- a/caffe2/python/control_ops_util.py +++ b/caffe2/python/control_ops_util.py @@ -1,9 +1,9 @@ ## @package control_ops_util # Module caffe2.python.control_ops_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py index e51aeffa8b04..3f9df172d2b7 100644 --- a/caffe2/python/control_test.py +++ b/caffe2/python/control_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import control, core, test_util, workspace diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 44f81d6e2d13..18033661a69e 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -1,9 
+1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index 82c969c901ea..a1dc52aad2d9 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import convert, workspace from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 3b493277a182..6d7c503e2c81 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -1,9 +1,9 @@ ## @package core # Module caffe2.python.core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple, OrderedDict, defaultdict from past.builtins import basestring diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 8b229029f5f7..3674b7aa4585 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from future.utils import bytes_to_native_str from hypothesis import given, settings diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 8660f5cc2106..b0f5b11f0d1c 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from inspect import currentframe, getframeinfo import unittest diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py index a009f8f0fa31..703ae604c654 100644 --- a/caffe2/python/crf.py +++ b/caffe2/python/crf.py @@ -1,6 +1,6 @@ ## @package crf # Module caffe2.python.crf -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import brew, core, model_helper, recurrent diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py index dd1c8720bfb1..9bc0372c50c0 100644 --- a/caffe2/python/crf_predict.py +++ b/caffe2/python/crf_predict.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python.crf import CRFWithLoss diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py index 970a7c6d4a8f..052bbbf4e6bf 100644 --- a/caffe2/python/crf_viterbi_test.py +++ b/caffe2/python/crf_viterbi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf from caffe2.python.cnn import CNNModelHelper diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 7f5527472cc2..95abb7159d42 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1,8 +1,8 @@ ## @package data_parallel_model # Module 
caffe2.python.data_parallel_model -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from collections import OrderedDict from future.utils import viewitems, viewkeys, viewvalues diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index e106dee97039..a0dbb3037c2c 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from future.utils import viewkeys from multiprocessing import Process, Queue diff --git a/caffe2/python/data_workers.py b/caffe2/python/data_workers.py index eb49da78c0af..698a8953ef13 100644 --- a/caffe2/python/data_workers.py +++ b/caffe2/python/data_workers.py @@ -1,9 +1,9 @@ ## @package data_workers # Module caffe2.python.data_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/data_workers_test.py b/caffe2/python/data_workers_test.py index 1abd8dfa28d7..4669aaf59476 100644 --- a/caffe2/python/data_workers_test.py +++ b/caffe2/python/data_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py index 5221262582ee..ff6e9c6860f6 100644 --- a/caffe2/python/dataio.py +++ b/caffe2/python/dataio.py @@ -15,10 +15,10 @@ See `dataset.py` for an example of implementation. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.schema import Field, Struct, from_blob_list diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 26f1c0902f71..0c45fb50aed9 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.dataio import ( CompositeReader, diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py index 387dbbaead58..4c2d4c806476 100644 --- a/caffe2/python/dataset.py +++ b/caffe2/python/dataset.py @@ -10,10 +10,10 @@ is stored as a set of native Caffe2 tensors, thus no type conversion or deserialization is necessary. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/db_file_reader.py b/caffe2/python/db_file_reader.py index 9296f1c6b7db..265b19251717 100644 --- a/caffe2/python/db_file_reader.py +++ b/caffe2/python/db_file_reader.py @@ -1,9 +1,9 @@ ## @package db_file_reader # Module caffe2.python.db_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace, _import_c_extension as C from caffe2.python.dataio import Reader diff --git a/caffe2/python/db_test.py b/caffe2/python/db_test.py index f642202b36f0..f0f5d2770dc0 100644 --- a/caffe2/python/db_test.py +++ b/caffe2/python/db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace diff --git a/caffe2/python/docs/formatter.py b/caffe2/python/docs/formatter.py index 0a16420f6d5a..904f1731e960 100644 --- a/caffe2/python/docs/formatter.py +++ b/caffe2/python/docs/formatter.py @@ -1,9 +1,9 @@ ## @package formatter # Module caffe2.python.docs.formatter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.docs.parser import Parser diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccb..c5a7df369bc2 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -1,9 +1,9 @@ ## @package generator # Module caffe2.python.docs.generator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python import core, workspace diff --git a/caffe2/python/docs/github.py b/caffe2/python/docs/github.py index 5cb1fdcf5d7b..3fd78507346e 100644 --- a/caffe2/python/docs/github.py +++ b/caffe2/python/docs/github.py @@ -1,9 +1,9 @@ ## @package github # Module caffe2.python.docs.github -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python.docs.formatter import Markdown diff --git a/caffe2/python/docs/parser.py b/caffe2/python/docs/parser.py index 024989c97e25..a4edb6e07246 100644 --- a/caffe2/python/docs/parser.py +++ b/caffe2/python/docs/parser.py @@ -1,9 +1,9 @@ ## @package parser # Module caffe2.python.docs.parser -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import re diff --git a/caffe2/python/dyndep.py b/caffe2/python/dyndep.py index 8bea14423875..0382cc3a8212 100644 --- a/caffe2/python/dyndep.py +++ b/caffe2/python/dyndep.py @@ -1,9 +1,9 @@ ## @package dyndep # Module caffe2.python.dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import ctypes import os diff --git a/caffe2/python/embedding_generation_benchmark.py 
b/caffe2/python/embedding_generation_benchmark.py index a4d66036b93d..33dbf757dda4 100644 --- a/caffe2/python/embedding_generation_benchmark.py +++ b/caffe2/python/embedding_generation_benchmark.py @@ -1,9 +1,9 @@ ## @package embedding_generation_benchmark # Module caffe2.python.embedding_generation_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, model_helper diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py index fb2059f94868..59e85431e8bf 100644 --- a/caffe2/python/examples/char_rnn.py +++ b/caffe2/python/examples/char_rnn.py @@ -1,9 +1,9 @@ ## @package char_rnn # Module caffe2.python.examples.char_rnn -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, model_helper, utils, brew from caffe2.python.rnn_cell import LSTM diff --git a/caffe2/python/examples/lmdb_create_example.py b/caffe2/python/examples/lmdb_create_example.py index b29b3b806001..af56069a7be0 100644 --- a/caffe2/python/examples/lmdb_create_example.py +++ b/caffe2/python/examples/lmdb_create_example.py @@ -1,9 +1,9 @@ ## @package lmdb_create_example # Module caffe2.python.examples.lmdb_create_example -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py index cbe9491d9cf6..822a0a2950ba 100644 --- a/caffe2/python/experiment_util.py +++ b/caffe2/python/experiment_util.py @@ -1,9 +1,9 @@ ## @package experiment_util # Module caffe2.python.experiment_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import datetime import time diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py index c533ae6d77bc..06c6707dcce9 100644 --- a/caffe2/python/extension_loader.py +++ b/caffe2/python/extension_loader.py @@ -1,9 +1,9 @@ ## @package extension_loader # Module caffe2.python.extension_loader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import ctypes import sys diff --git a/caffe2/python/fakefp16_transform_lib.py b/caffe2/python/fakefp16_transform_lib.py index 885f15732055..c3f142061479 100644 --- a/caffe2/python/fakefp16_transform_lib.py +++ b/caffe2/python/fakefp16_transform_lib.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import caffe2.python._import_c_extension as C from caffe2.proto.caffe2_pb2 import NetDef diff --git a/caffe2/python/fakelowp/init_shared_libs.py b/caffe2/python/fakelowp/init_shared_libs.py index d289c7c4a97d..2a98de4571aa 100644 --- a/caffe2/python/fakelowp/init_shared_libs.py +++ b/caffe2/python/fakelowp/init_shared_libs.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import ctypes import os diff --git a/caffe2/python/fakelowp/test_utils.py 
b/caffe2/python/fakelowp/test_utils.py index 75e4422f3ccc..4a31a92e5bce 100644 --- a/caffe2/python/fakelowp/test_utils.py +++ b/caffe2/python/fakelowp/test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import numpy as np diff --git a/caffe2/python/filler_test.py b/caffe2/python/filler_test.py index 52ea756d5bea..9aff384e99af 100644 --- a/caffe2/python/filler_test.py +++ b/caffe2/python/filler_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py index 7c26f69a0c43..d32acb3d8a90 100644 --- a/caffe2/python/functional.py +++ b/caffe2/python/functional.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py index e7803e829bb4..d90943761aa4 100644 --- a/caffe2/python/functional_test.py +++ b/caffe2/python/functional_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py index d2ecf118ea27..a7e5d714b63c 100644 --- a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py +++ b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index 1b492229a433..3f8dd83b5538 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -2,10 +2,10 @@ # can gradually remove this test script. DO NOT ADD MORE TESTS TO THIS # FILE. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import ( brew, diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py index b1cdcc2bbb56..afb8d5071492 100644 --- a/caffe2/python/gradient_checker.py +++ b/caffe2/python/gradient_checker.py @@ -1,9 +1,9 @@ ## @package gradient_checker # Module caffe2.python.gradient_checker -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py index e6caa2cae1eb..049a9152878a 100644 --- a/caffe2/python/gru_cell.py +++ b/caffe2/python/gru_cell.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools from caffe2.python import brew, rnn_cell diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py index 6bc3779a4ca1..948c55ac88ce 100644 --- a/caffe2/python/helpers/algebra.py +++ b/caffe2/python/helpers/algebra.py @@ -1,9 +1,9 @@ ## @package algebra # Module caffe2.python.helpers.algebra -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def transpose(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/arg_scope.py b/caffe2/python/helpers/arg_scope.py index ac6978be8064..a112e9b84c5d 100644 --- a/caffe2/python/helpers/arg_scope.py +++ b/caffe2/python/helpers/arg_scope.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import contextlib import copy import threading diff --git a/caffe2/python/helpers/array_helpers.py b/caffe2/python/helpers/array_helpers.py index 3f8955331d4e..fae0011bf1f6 100644 --- a/caffe2/python/helpers/array_helpers.py +++ b/caffe2/python/helpers/array_helpers.py @@ -1,9 +1,9 @@ ## @package arra_helpers # Module caffe2.python.helpers.array_helpers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def concat(model, blobs_in, blob_out, **kwargs): diff --git a/caffe2/python/helpers/control_ops.py b/caffe2/python/helpers/control_ops.py index a738a71fe44c..c6f71d0761a5 100644 --- a/caffe2/python/helpers/control_ops.py +++ b/caffe2/python/helpers/control_ops.py @@ -1,9 +1,9 @@ ## @package control_ops # Module caffe2.python.helpers.control_ops -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.control_ops_util import add_if_op, add_while_op diff --git a/caffe2/python/helpers/conv.py b/caffe2/python/helpers/conv.py index bb88b2e3757f..dfca165084df 100644 --- a/caffe2/python/helpers/conv.py +++ b/caffe2/python/helpers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.helpers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git 
a/caffe2/python/helpers/db_input.py b/caffe2/python/helpers/db_input.py index 6e642a393da4..d5772cb7653e 100644 --- a/caffe2/python/helpers/db_input.py +++ b/caffe2/python/helpers/db_input.py @@ -1,9 +1,9 @@ ## @package db_input # Module caffe2.python.helpers.db_input -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def db_input(model, blobs_out, batch_size, db, db_type): dbreader_name = "dbreader_" + db diff --git a/caffe2/python/helpers/dropout.py b/caffe2/python/helpers/dropout.py index 6fbb5bcda99a..d7280318f60d 100644 --- a/caffe2/python/helpers/dropout.py +++ b/caffe2/python/helpers/dropout.py @@ -1,9 +1,9 @@ ## @package dropout # Module caffe2.python.helpers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def dropout(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/elementwise_linear.py b/caffe2/python/helpers/elementwise_linear.py index 55fbd708489c..ef9184d00dd2 100644 --- a/caffe2/python/helpers/elementwise_linear.py +++ b/caffe2/python/helpers/elementwise_linear.py @@ -1,9 +1,9 @@ ## @package elementwise_linear # Module caffe2.python.helpers.elementwise_linear -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/fc.py b/caffe2/python/helpers/fc.py index 9d61dc7ac145..0feb2b65745e 100644 --- a/caffe2/python/helpers/fc.py +++ b/caffe2/python/helpers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.helpers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git a/caffe2/python/helpers/nonlinearity.py b/caffe2/python/helpers/nonlinearity.py index f773cc3114de..3a8be3bb056a 100644 --- a/caffe2/python/helpers/nonlinearity.py +++ b/caffe2/python/helpers/nonlinearity.py @@ -1,9 +1,9 @@ ## @package nonlinearity # Module caffe2.python.helpers.nonlinearity -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/helpers/normalization.py b/caffe2/python/helpers/normalization.py index 621f565b5455..b13b43f6859a 100644 --- a/caffe2/python/helpers/normalization.py +++ b/caffe2/python/helpers/normalization.py @@ -1,9 +1,9 @@ ## @package normalization # Module caffe2.python.helpers.normalization -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/pooling.py b/caffe2/python/helpers/pooling.py index 412d55434d16..9e6fc784f289 100644 --- a/caffe2/python/helpers/pooling.py +++ b/caffe2/python/helpers/pooling.py @@ -2,10 +2,10 @@ # Module caffe2.python.helpers.pooling ## @package fc # Module caffe2.python.helpers.pooling -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -from __future__ import unicode_literals + + + + def max_pool(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): diff --git a/caffe2/python/helpers/tools.py b/caffe2/python/helpers/tools.py index 59defe9e236b..178620eab593 100644 --- a/caffe2/python/helpers/tools.py +++ b/caffe2/python/helpers/tools.py @@ -1,9 +1,9 @@ ## @package tools # Module caffe2.python.helpers.tools -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def image_input( diff --git a/caffe2/python/helpers/train.py b/caffe2/python/helpers/train.py index bee36347808a..02883af7402d 100644 --- a/caffe2/python/helpers/train.py +++ b/caffe2/python/helpers/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.helpers.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hip_test_util.py b/caffe2/python/hip_test_util.py index 3910c9e5c2ce..beab3be1c40a 100644 --- a/caffe2/python/hip_test_util.py +++ b/caffe2/python/hip_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hsm_util.py b/caffe2/python/hsm_util.py index e98056f9cd88..ec465c12240e 100644 --- a/caffe2/python/hsm_util.py +++ b/caffe2/python/hsm_util.py @@ -1,9 +1,9 @@ ## @package hsm_util # Module caffe2.python.hsm_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import hsm_pb2 diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 897be5fab44a..8a286383f60f 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import copy diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 797010b46890..2000e269969e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -34,10 +34,10 @@ implemented on the CPU. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import ( workspace, device_checker, gradient_checker, test_util, core) diff --git a/caffe2/python/ideep/LRN_op_test.py b/caffe2/python/ideep/LRN_op_test.py index 956f10be8831..23ecd79062f7 100644 --- a/caffe2/python/ideep/LRN_op_test.py +++ b/caffe2/python/ideep/LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/adam_op_test.py b/caffe2/python/ideep/adam_op_test.py index a0d9b2ce014f..5ac0395bff63 100644 --- a/caffe2/python/ideep/adam_op_test.py +++ b/caffe2/python/ideep/adam_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/blobs_queue_db_test.py b/caffe2/python/ideep/blobs_queue_db_test.py index ded18e89c5ae..966fcc23d47d 100644 --- a/caffe2/python/ideep/blobs_queue_db_test.py +++ b/caffe2/python/ideep/blobs_queue_db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/channel_shuffle_op_test.py b/caffe2/python/ideep/channel_shuffle_op_test.py index 8c3eea3d8618..b4cedca61061 100644 --- a/caffe2/python/ideep/channel_shuffle_op_test.py +++ b/caffe2/python/ideep/channel_shuffle_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/concat_split_op_test.py b/caffe2/python/ideep/concat_split_op_test.py index c28a7f1fe52c..75c9ceeba0e4 100644 --- a/caffe2/python/ideep/concat_split_op_test.py +++ b/caffe2/python/ideep/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index e82d8aec5515..ae4473ea4864 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import sys diff --git a/caffe2/python/ideep/conv_transpose_test.py b/caffe2/python/ideep/conv_transpose_test.py index be35dbd8a382..eeda2ea43a2d 100644 --- a/caffe2/python/ideep/conv_transpose_test.py +++ b/caffe2/python/ideep/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index f24333745741..18ce574b623b 100644 --- 
a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/copy_op_test.py b/caffe2/python/ideep/copy_op_test.py index 4b0a15bd999a..668282f2e159 100644 --- a/caffe2/python/ideep/copy_op_test.py +++ b/caffe2/python/ideep/copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index efecfb501bff..33b0a52a7421 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from hypothesis import given diff --git a/caffe2/python/ideep/elementwise_sum_op_test.py b/caffe2/python/ideep/elementwise_sum_op_test.py index 9daf34088fc0..11a35d6b2b28 100644 --- a/caffe2/python/ideep/elementwise_sum_op_test.py +++ b/caffe2/python/ideep/elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/expanddims_squeeze_op_test.py b/caffe2/python/ideep/expanddims_squeeze_op_test.py index 4a4fb7319b25..3693a217bb4b 100644 --- a/caffe2/python/ideep/expanddims_squeeze_op_test.py +++ b/caffe2/python/ideep/expanddims_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py index 9e29bfaed919..6549bb6ad6bb 100644 --- a/caffe2/python/ideep/fc_op_test.py +++ b/caffe2/python/ideep/fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from functools import reduce diff --git a/caffe2/python/ideep/leaky_relu_op_test.py b/caffe2/python/ideep/leaky_relu_op_test.py index 8a68d2e608ef..6d84f88f4fe2 100644 --- a/caffe2/python/ideep/leaky_relu_op_test.py +++ b/caffe2/python/ideep/leaky_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py index 06d0e9be0e57..596bab0ad3cc 100644 --- a/caffe2/python/ideep/moment_sgd_op_test.py +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git 
a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py index 6d40a88b5c13..dc928c264082 100644 --- a/caffe2/python/ideep/operator_fallback_op_test.py +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index 8a967dcf9c08..a259e01bab10 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/pool_op_test.py b/caffe2/python/ideep/pool_op_test.py index 9659d3961338..9ab3fcddbadb 100644 --- a/caffe2/python/ideep/pool_op_test.py +++ b/caffe2/python/ideep/pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/pre_convert_test.py b/caffe2/python/ideep/pre_convert_test.py index a32eedd74469..6c0b7ca5d7a7 100644 --- a/caffe2/python/ideep/pre_convert_test.py +++ b/caffe2/python/ideep/pre_convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/relu_op_test.py b/caffe2/python/ideep/relu_op_test.py index bd05c69381c5..e2fda68aed2b 100644 --- a/caffe2/python/ideep/relu_op_test.py +++ b/caffe2/python/ideep/relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/reshape_op_test.py b/caffe2/python/ideep/reshape_op_test.py index c9714f6eb4a5..c2bca948a52c 100644 --- a/caffe2/python/ideep/reshape_op_test.py +++ b/caffe2/python/ideep/reshape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index e1ab30c12e45..47114832f85d 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/sigmoid_op_test.py b/caffe2/python/ideep/sigmoid_op_test.py index b67932108084..2b5eb0e3a2b5 100644 --- a/caffe2/python/ideep/sigmoid_op_test.py +++ b/caffe2/python/ideep/sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from 
__future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/softmax_op_test.py b/caffe2/python/ideep/softmax_op_test.py index 9043061514a0..b76d6509609b 100644 --- a/caffe2/python/ideep/softmax_op_test.py +++ b/caffe2/python/ideep/softmax_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 25b83e2447fc..618a0e7fbfc3 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index b0483cf4c4b6..aa1c5bc260fa 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 6345b76735a7..962d4051718b 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import copy diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index b02085a3ba3b..8b324ed964ae 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/weightedsum_op_test.py b/caffe2/python/ideep/weightedsum_op_test.py index 2a0b3ec3e7b0..b1e46fca4851 100644 --- a/caffe2/python/ideep/weightedsum_op_test.py +++ b/caffe2/python/ideep/weightedsum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index e131ee027c35..7129ed14ba74 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -6,10 +6,10 @@ operators. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 90e5a4d76b6d..7c3dda3b320c 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -1,9 +1,9 @@ # @package layer_model_helper # Module caffe2.python.layer_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, model_helper, schema, scope, utils, muji from caffe2.python.modeling.parameter_info import ( diff --git a/caffe2/python/layer_model_instantiator.py b/caffe2/python/layer_model_instantiator.py index 9ceb1310bf30..9284b9b9e687 100644 --- a/caffe2/python/layer_model_instantiator.py +++ b/caffe2/python/layer_model_instantiator.py @@ -1,9 +1,9 @@ ## @package layer_model_instantiator # Module caffe2.python.layer_model_instantiator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import InstantiationContext diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py index 5d87dbd7522a..518412b9e90c 100644 --- a/caffe2/python/layer_parameter_sharing_test.py +++ b/caffe2/python/layer_parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/layer_test_util.py b/caffe2/python/layer_test_util.py index 2f2e23062e34..ae28e82b98cc 100644 --- a/caffe2/python/layer_test_util.py +++ b/caffe2/python/layer_test_util.py @@ -1,9 +1,9 @@ ## @package layer_test_util # Module caffe2.python.layer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple diff --git a/caffe2/python/layers/__init__.py b/caffe2/python/layers/__init__.py index 2a09dc8419a6..487b7751fd08 100644 --- a/caffe2/python/layers/__init__.py +++ b/caffe2/python/layers/__init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from importlib import import_module import pkgutil diff --git a/caffe2/python/layers/adaptive_weight.py b/caffe2/python/layers/adaptive_weight.py index c081e8573038..146a0bdb1974 100644 --- a/caffe2/python/layers/adaptive_weight.py +++ b/caffe2/python/layers/adaptive_weight.py @@ -1,6 +1,6 @@ # @package adaptive_weight # Module caffe2.fb.python.layers.adaptive_weight -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, schema diff --git a/caffe2/python/layers/add_bias.py b/caffe2/python/layers/add_bias.py index 0ffa46afb2b3..1a0fd8b295f3 100644 --- a/caffe2/python/layers/add_bias.py +++ b/caffe2/python/layers/add_bias.py @@ -1,9 +1,9 @@ ## @package add_bias # Module caffe2.python.layers.add_bias -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/arc_cosine_feature_map.py b/caffe2/python/layers/arc_cosine_feature_map.py index 2409eca551a1..89c5014f5c5c 100644 --- a/caffe2/python/layers/arc_cosine_feature_map.py +++ b/caffe2/python/layers/arc_cosine_feature_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py index 48b6ebcf8f58..0a5323625419 100644 --- a/caffe2/python/layers/batch_huber_loss.py +++ b/caffe2/python/layers/batch_huber_loss.py @@ -1,9 +1,9 @@ # @package batch_huber_loss # Module caffe2.python.layers.batch_huber_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py index a560a3f654a9..46b0e4d42cdf 100644 --- a/caffe2/python/layers/batch_lr_loss.py +++ b/caffe2/python/layers/batch_lr_loss.py @@ -1,9 +1,9 @@ ## @package batch_lr_loss # Module caffe2.python.layers.batch_lr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_mse_loss.py b/caffe2/python/layers/batch_mse_loss.py index 89da74f3c1e9..b0dd63ab09c8 100644 --- a/caffe2/python/layers/batch_mse_loss.py +++ b/caffe2/python/layers/batch_mse_loss.py @@ -1,9 +1,9 @@ ## @package batch_mse_loss # Module caffe2.python.layers.batch_mse_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py index 9fe3ee51eb56..6395b09ff67f 100644 --- a/caffe2/python/layers/batch_normalization.py +++ b/caffe2/python/layers/batch_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py index 9ef8cf563dbe..84e7d4873f50 100644 --- a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py +++ b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py @@ -1,9 +1,9 @@ ## @package batch_sigmoid_cross_entropy_loss # Module caffe2.python.layers.batch_sigmoid_cross_entropy_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from 
caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_softmax_loss.py b/caffe2/python/layers/batch_softmax_loss.py index d5f9413ef96a..30667a04c159 100644 --- a/caffe2/python/layers/batch_softmax_loss.py +++ b/caffe2/python/layers/batch_softmax_loss.py @@ -1,9 +1,9 @@ ## @package batch_softmax_loss # Module caffe2.python.layers.batch_softmax_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/blob_weighted_sum.py b/caffe2/python/layers/blob_weighted_sum.py index cf8ecfd99045..a37fab463581 100644 --- a/caffe2/python/layers/blob_weighted_sum.py +++ b/caffe2/python/layers/blob_weighted_sum.py @@ -1,9 +1,9 @@ ## @package BlobWeightedSum # Module caffe2.python.layers.blob_weighted_sum -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/bpr_loss.py b/caffe2/python/layers/bpr_loss.py index 4e6a60fdaa57..389de8c241e8 100644 --- a/caffe2/python/layers/bpr_loss.py +++ b/caffe2/python/layers/bpr_loss.py @@ -1,9 +1,9 @@ ## @package bpr_loss # Module caffe2.python.layers.bpr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/bucket_weighted.py b/caffe2/python/layers/bucket_weighted.py index 3c750e7b136f..2c200a922fdd 100644 --- a/caffe2/python/layers/bucket_weighted.py +++ b/caffe2/python/layers/bucket_weighted.py @@ -1,9 +1,9 @@ ## @package bucket_weighted # Module caffe2.python.layers.bucket_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/build_index.py b/caffe2/python/layers/build_index.py index b8c999bc256e..29c63f3d8948 100644 --- a/caffe2/python/layers/build_index.py +++ b/caffe2/python/layers/build_index.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers/concat.py b/caffe2/python/layers/concat.py index 062485757edc..fb1dc6ab6dbf 100644 --- a/caffe2/python/layers/concat.py +++ b/caffe2/python/layers/concat.py @@ -1,9 +1,9 @@ ## @package concat # Module caffe2.python.layers.concat -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/constant_weight.py b/caffe2/python/layers/constant_weight.py index 06e9d9cd9b66..d160ed8206b3 100644 --- a/caffe2/python/layers/constant_weight.py +++ b/caffe2/python/layers/constant_weight.py @@ -1,9 +1,9 @@ # @package constant_weight # Module caffe2.fb.python.layers.constant_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import 
print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/conv.py b/caffe2/python/layers/conv.py index bb22acf0cafa..e98bac7e2d80 100644 --- a/caffe2/python/layers/conv.py +++ b/caffe2/python/layers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.layers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/dropout.py b/caffe2/python/layers/dropout.py index a5d3f01a440e..4bc0cf2785b2 100644 --- a/caffe2/python/layers/dropout.py +++ b/caffe2/python/layers/dropout.py @@ -1,8 +1,8 @@ # Module caffe2.python.layers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/fc.py b/caffe2/python/layers/fc.py index a9eeceff2c21..9220f22165a3 100644 --- a/caffe2/python/layers/fc.py +++ b/caffe2/python/layers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.layers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.helpers.arg_scope import get_current_scope from caffe2.python import schema diff --git a/caffe2/python/layers/fc_with_bootstrap.py b/caffe2/python/layers/fc_with_bootstrap.py index 6a48f572ddba..b3c2eb346f96 100644 --- a/caffe2/python/layers/fc_with_bootstrap.py +++ b/caffe2/python/layers/fc_with_bootstrap.py @@ -1,6 +1,6 @@ ## @package fc_with_bootstrap # Module caffe2.python.layers.fc_with_bootstrap -from __future__ import absolute_import, division, print_function, unicode_literals + import math diff --git a/caffe2/python/layers/fc_without_bias.py b/caffe2/python/layers/fc_without_bias.py index e8923a8e5b9c..2899af618b79 100644 --- a/caffe2/python/layers/fc_without_bias.py +++ b/caffe2/python/layers/fc_without_bias.py @@ -1,9 +1,9 @@ ## @package fc_without_bias # Module caffe2.python.layers.fc_without_bias -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py index 69fe91a48691..ca004d136ded 100644 --- a/caffe2/python/layers/feature_sparse_to_dense.py +++ b/caffe2/python/layers/feature_sparse_to_dense.py @@ -1,6 +1,6 @@ # @package sparse_to_dense # Module caffe2.python.layers.sparse_to_dense -from __future__ import absolute_import, division, print_function, unicode_literals + from collections import defaultdict diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py index 53d5c050242f..c6d156fd68ce 100644 --- a/caffe2/python/layers/functional.py +++ b/caffe2/python/layers/functional.py @@ -1,9 +1,9 @@ # @package functional # Module caffe2.python.layers.functional -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python 
import core, schema, scope, workspace from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/gather_record.py b/caffe2/python/layers/gather_record.py index 1289c097902c..da468d5db90c 100644 --- a/caffe2/python/layers/gather_record.py +++ b/caffe2/python/layers/gather_record.py @@ -1,9 +1,9 @@ ## @package gather_record # Module caffe2.python.layers.gather_record -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/homotopy_weight.py b/caffe2/python/layers/homotopy_weight.py index 63da1f04abf4..4c24223cbc8d 100644 --- a/caffe2/python/layers/homotopy_weight.py +++ b/caffe2/python/layers/homotopy_weight.py @@ -1,10 +1,10 @@ # @package homotopy_weight # Module caffe2.fb.python.layers.homotopy_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/label_smooth.py b/caffe2/python/layers/label_smooth.py index e2282e051611..7e4987270660 100644 --- a/caffe2/python/layers/label_smooth.py +++ b/caffe2/python/layers/label_smooth.py @@ -15,10 +15,10 @@ # @package label_smooth # Module caffe2.python.layers.label_smooth -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py index fb93effbff2d..a16b731a2f78 100644 --- a/caffe2/python/layers/last_n_window_collector.py +++ b/caffe2/python/layers/last_n_window_collector.py @@ -1,9 +1,9 @@ ## @package last_n_window_collector # Module caffe2.python.layers.last_n_window_collector -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py index 0dc6795994cb..580a03bfc5da 100644 --- a/caffe2/python/layers/layer_normalization.py +++ b/caffe2/python/layers/layer_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layers.py b/caffe2/python/layers/layers.py index 216d0b2e3286..abcdd1596220 100644 --- a/caffe2/python/layers/layers.py +++ b/caffe2/python/layers/layers.py @@ -1,6 +1,6 @@ ## @package layers # Module caffe2.python.layers.layers -from __future__ import absolute_import, division, print_function, unicode_literals + import logging from collections import namedtuple diff --git a/caffe2/python/layers/margin_rank_loss.py b/caffe2/python/layers/margin_rank_loss.py index 15267752caa3..6f97ade23ef4 100644 --- a/caffe2/python/layers/margin_rank_loss.py +++ b/caffe2/python/layers/margin_rank_loss.py @@ -1,9 +1,9 @@ ## @package 
random_neg_rank_loss # Module caffe2.python.layers.random_neg_rank_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py index 117dd7904787..68c27b587567 100644 --- a/caffe2/python/layers/merge_id_lists.py +++ b/caffe2/python/layers/merge_id_lists.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/pairwise_similarity.py b/caffe2/python/layers/pairwise_similarity.py index 30cb6ace2b81..5020e5432c2a 100644 --- a/caffe2/python/layers/pairwise_similarity.py +++ b/caffe2/python/layers/pairwise_similarity.py @@ -1,9 +1,9 @@ ## @package dot_product # Module caffe2.python.layers.dot_product -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/position_weighted.py b/caffe2/python/layers/position_weighted.py index 19ddda2b6dcf..12e26bcd774e 100644 --- a/caffe2/python/layers/position_weighted.py +++ b/caffe2/python/layers/position_weighted.py @@ -1,9 +1,9 @@ ## @package position_weighted # Module caffe2.python.layers.position_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py index 6056da4ba7cf..bde05ab97147 100644 --- a/caffe2/python/layers/random_fourier_features.py +++ b/caffe2/python/layers/random_fourier_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/reservoir_sampling.py b/caffe2/python/layers/reservoir_sampling.py index 3819a1971da4..21b9c44f2a79 100644 --- a/caffe2/python/layers/reservoir_sampling.py +++ b/caffe2/python/layers/reservoir_sampling.py @@ -1,9 +1,9 @@ ## @package reservoir_sampling # Module caffe2.python.layers.reservoir_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/sampling_train.py b/caffe2/python/layers/sampling_train.py index 1c617326da7f..034c897e2c2f 100644 --- a/caffe2/python/layers/sampling_train.py +++ b/caffe2/python/layers/sampling_train.py @@ -1,9 +1,9 @@ ## @package sampling_train # Module caffe2.python.layers.sampling_train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer, get_layer_class 
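The hunks in this part of the patch all make the same mechanical change: the Python 2 compatibility imports (from __future__ import absolute_import / division / print_function / unicode_literals) are removed and each removed line is replaced by a blank line, leaving the rest of every module untouched. A change of this shape is normally produced by a small codemod rather than edited by hand; the script below is a minimal sketch of such a tool (hypothetical; the patch does not state which tool was actually used).

    # Hypothetical codemod sketch: blank out `from __future__ import ...` lines
    # so each removed import becomes an empty line, matching the '-from ...' -> '+'
    # pattern in the hunks above and below. Not the actual tool used for this change.
    import re
    import sys

    FUTURE_IMPORT = re.compile(r"^from\s+__future__\s+import\s+.*$")

    def strip_future_imports(path):
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        changed = False
        for i, line in enumerate(lines):
            if FUTURE_IMPORT.match(line):
                lines[i] = "\n"  # keep the line count, drop the import
                changed = True
        if changed:
            with open(path, "w", encoding="utf-8") as f:
                f.writelines(lines)
        return changed

    if __name__ == "__main__":
        for p in sys.argv[1:]:
            if strip_future_imports(p):
                print("rewrote", p)

Run over the caffe2/python tree, a script like this would reproduce hunks of the same form as those in this patch, including the single-line "from __future__ import absolute_import, division, print_function, unicode_literals" variants, which the regex also matches.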
diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py index 911fd8391e3f..403cc5a4a51c 100644 --- a/caffe2/python/layers/sampling_trainable_mixin.py +++ b/caffe2/python/layers/sampling_trainable_mixin.py @@ -1,9 +1,9 @@ ## @package sampling_trainable_mixin # Module caffe2.python.layers.sampling_trainable_mixin -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py index 65e44bece97c..49e42ca308d7 100644 --- a/caffe2/python/layers/select_record_by_context.py +++ b/caffe2/python/layers/select_record_by_context.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging diff --git a/caffe2/python/layers/semi_random_features.py b/caffe2/python/layers/semi_random_features.py index d7b96d956d08..58f30ac71f19 100644 --- a/caffe2/python/layers/semi_random_features.py +++ b/caffe2/python/layers/semi_random_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.arc_cosine_feature_map import ArcCosineFeatureMap diff --git a/caffe2/python/layers/sparse_dropout_with_replacement.py b/caffe2/python/layers/sparse_dropout_with_replacement.py index 8275d83d8734..3e03888e57dc 100644 --- a/caffe2/python/layers/sparse_dropout_with_replacement.py +++ b/caffe2/python/layers/sparse_dropout_with_replacement.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_feature_hash.py b/caffe2/python/layers/sparse_feature_hash.py index 3927b199fbdf..c3ada99dc4a7 100644 --- a/caffe2/python/layers/sparse_feature_hash.py +++ b/caffe2/python/layers/sparse_feature_hash.py @@ -1,9 +1,9 @@ ## @package sparse_feature_hash # Module caffe2.python.layers.sparse_feature_hash -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index 30cb60266c4d..dd1c42606063 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -1,9 +1,9 @@ ## @package sparse_lookup # Module caffe2.python.layers.sparse_lookup -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.optimizer import FP16_ENGINES, Optimizer from caffe2.python.helpers.arg_scope import get_current_scope diff --git a/caffe2/python/layers/split.py b/caffe2/python/layers/split.py index a83881f5a091..58e569a272c7 100644 --- a/caffe2/python/layers/split.py +++ b/caffe2/python/layers/split.py @@ -1,9 +1,9 @@ ## @package split # Module caffe2.python.layers.split -from __future__ import absolute_import 
-from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py index 28b7312dbcaa..5161ee2e1a96 100644 --- a/caffe2/python/layers/tags.py +++ b/caffe2/python/layers/tags.py @@ -1,9 +1,9 @@ ## @package tags # Module caffe2.python.layers.tags -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import six diff --git a/caffe2/python/layers/uniform_sampling.py b/caffe2/python/layers/uniform_sampling.py index 46ed29bbaa41..5581371d008d 100644 --- a/caffe2/python/layers/uniform_sampling.py +++ b/caffe2/python/layers/uniform_sampling.py @@ -1,9 +1,9 @@ ## @package uniform_sampling # Module caffe2.python.layers.uniform_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index 4d037a891ade..e084a011d357 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/lazy_dyndep.py b/caffe2/python/lazy_dyndep.py index e1799838f4b2..e53d4fda350b 100644 --- a/caffe2/python/lazy_dyndep.py +++ b/caffe2/python/lazy_dyndep.py @@ -1,9 +1,9 @@ ## @package lazy_dyndep # Module caffe2.python.lazy_dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os from caffe2.python import dyndep, lazy diff --git a/caffe2/python/lazy_dyndep_test.py b/caffe2/python/lazy_dyndep_test.py index 881215ac36e3..1441facd3a6f 100644 --- a/caffe2/python/lazy_dyndep_test.py +++ b/caffe2/python/lazy_dyndep_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py index f08e9147d3ba..718b7fb3a987 100644 --- a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py +++ b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py index d73db5aaa36c..a38d442dd952 100644 --- a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py +++ b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git 
a/caffe2/python/lstm_benchmark.py b/caffe2/python/lstm_benchmark.py index cfa53a81610c..29f819ec622e 100644 --- a/caffe2/python/lstm_benchmark.py +++ b/caffe2/python/lstm_benchmark.py @@ -1,9 +1,9 @@ ## @package lstm_benchmark # Module caffe2.python.lstm_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, rnn_cell, model_helper diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py index c299c817ace4..a728fc4e2157 100644 --- a/caffe2/python/memonger.py +++ b/caffe2/python/memonger.py @@ -1,9 +1,9 @@ ## @package memonger # Module caffe2.python.memonger -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import networkx as nx import collections diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py index 7d5c52224b1c..8584e8d5e4cc 100644 --- a/caffe2/python/memonger_test.py +++ b/caffe2/python/memonger_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 73df4820a5d1..2b084bea591b 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index 35eae62d5be1..ae42902d9102 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_concat_op_test.py b/caffe2/python/mkl/mkl_concat_op_test.py index a1a96ca755d9..8b01f8885b1c 100644 --- a/caffe2/python/mkl/mkl_concat_op_test.py +++ b/caffe2/python/mkl/mkl_concat_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index 38ceb680bb6d..f1fe7b062318 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_copy_op_test.py b/caffe2/python/mkl/mkl_copy_op_test.py index 633865cd5047..b2baeb9ef1af 100644 --- a/caffe2/python/mkl/mkl_copy_op_test.py +++ b/caffe2/python/mkl/mkl_copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import 
unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_add_op_test.py b/caffe2/python/mkl/mkl_elementwise_add_op_test.py index eab454ffe105..0709b5afd9f6 100644 --- a/caffe2/python/mkl/mkl_elementwise_add_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_add_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py index 71e0754a0214..3adec4848e50 100644 --- a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01e8c9b5a925..01786d55c337 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 7cabadfe1da0..85f5605e9676 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index dbdf12c1aca4..26a9b7131b0b 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_op_test.py b/caffe2/python/mkl/mkl_pool_op_test.py index b733edaace1c..a56e9448317a 100644 --- a/caffe2/python/mkl/mkl_pool_op_test.py +++ b/caffe2/python/mkl/mkl_pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index a0fa8ca6ece8..b25e0f915cc7 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_relu_op_test.py b/caffe2/python/mkl/mkl_relu_op_test.py index 90e365da554b..76ec33bcbe91 100644 --- a/caffe2/python/mkl/mkl_relu_op_test.py +++ b/caffe2/python/mkl/mkl_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 4a5fad2b7b68..2ac9080ce670 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index d37bef32b9b7..3b3b71d1c997 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_sigmoid_op_test.py b/caffe2/python/mkl/mkl_sigmoid_op_test.py index 654008c67b7d..abdb0983778d 100644 --- a/caffe2/python/mkl/mkl_sigmoid_op_test.py +++ b/caffe2/python/mkl/mkl_sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 4034705580d5..9a7310a484d1 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_squeeze_op_test.py b/caffe2/python/mkl/mkl_squeeze_op_test.py index 1e4b5791b0b6..8af090f60d88 100644 --- a/caffe2/python/mkl/mkl_squeeze_op_test.py +++ b/caffe2/python/mkl/mkl_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index c003e0e3b09b..3a88a3deeccc 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py index 42e3269fc4d8..1ad209cdbdfd 100644 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ b/caffe2/python/mkl/rewrite_graph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl_test_util.py b/caffe2/python/mkl_test_util.py index 5d8f66500190..88fb3cc800ec 100644 --- a/caffe2/python/mkl_test_util.py +++ b/caffe2/python/mkl_test_util.py @@ -6,10 +6,10 @@ operators. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index a26bf844f2de..a5a4865c0ec1 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -1,9 +1,9 @@ ## @package model_helper # Module caffe2.python.model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace from caffe2.python.helpers.db_input import db_input diff --git a/caffe2/python/model_helper_test.py b/caffe2/python/model_helper_test.py index fcccddf401db..1423e4a97733 100644 --- a/caffe2/python/model_helper_test.py +++ b/caffe2/python/model_helper_test.py @@ -1,6 +1,6 @@ """unittest for ModelHelper class""" -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/modeling/compute_histogram_for_blobs.py b/caffe2/python/modeling/compute_histogram_for_blobs.py index 3b5ea4b64cba..ea83f96f7019 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_histogram_for_blobs_test.py b/caffe2/python/modeling/compute_histogram_for_blobs_test.py index 6c3b59950898..4ce6bf11487a 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs_test.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py index 41b7f88d24eb..24ed7a7482c7 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ b/caffe2/python/modeling/compute_norm_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema, muji from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py index 3fefce0c4420..1bf3dae0353f 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ b/caffe2/python/modeling/compute_norm_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_statistics_for_blobs.py b/caffe2/python/modeling/compute_statistics_for_blobs.py index 9a3fbcc96954..588b4a827cb8 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs.py @@ -1,7 
+1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_statistics_for_blobs_test.py b/caffe2/python/modeling/compute_statistics_for_blobs_test.py index e880f3edacb1..bf75a1f7d149 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs_test.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/get_entry_from_blobs.py b/caffe2/python/modeling/get_entry_from_blobs.py index 88daa226c887..061dfe33991b 100644 --- a/caffe2/python/modeling/get_entry_from_blobs.py +++ b/caffe2/python/modeling/get_entry_from_blobs.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/get_entry_from_blobs_test.py b/caffe2/python/modeling/get_entry_from_blobs_test.py index 8f4fbb864be1..3ec146766f30 100644 --- a/caffe2/python/modeling/get_entry_from_blobs_test.py +++ b/caffe2/python/modeling/get_entry_from_blobs_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/gradient_clipping.py b/caffe2/python/modeling/gradient_clipping.py index 1999ced9ba1b..b01bc2ba301f 100644 --- a/caffe2/python/modeling/gradient_clipping.py +++ b/caffe2/python/modeling/gradient_clipping.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/modeling/gradient_clipping_test.py b/caffe2/python/modeling/gradient_clipping_test.py index ca5c2ba8e22b..0b0e962cb727 100644 --- a/caffe2/python/modeling/gradient_clipping_test.py +++ b/caffe2/python/modeling/gradient_clipping_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py index 2053d9e53976..b3e4b1a44dd7 100644 --- a/caffe2/python/modeling/initializers.py +++ b/caffe2/python/modeling/initializers.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.core import DataType, BlobReference, ScopedBlobReference from caffe2.python.modeling.parameter_info import ParameterInfo diff --git a/caffe2/python/modeling/initializers_test.py b/caffe2/python/modeling/initializers_test.py index 0355d1871787..fad40c159b6e 100644 --- a/caffe2/python/modeling/initializers_test.py +++ b/caffe2/python/modeling/initializers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import brew, model_helper, workspace diff --git a/caffe2/python/modeling/net_modifier.py b/caffe2/python/modeling/net_modifier.py index 0f0ac7535c88..e824c828e4bd 100644 --- a/caffe2/python/modeling/net_modifier.py +++ b/caffe2/python/modeling/net_modifier.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/modeling/parameter_info.py b/caffe2/python/modeling/parameter_info.py index 589aa51a7b1c..195048cf91e8 100644 --- a/caffe2/python/modeling/parameter_info.py +++ b/caffe2/python/modeling/parameter_info.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py index 77e5cbd3f8bc..a0174500a413 100644 --- a/caffe2/python/modeling/parameter_sharing.py +++ b/caffe2/python/modeling/parameter_sharing.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope diff --git a/caffe2/python/modeling/parameter_sharing_test.py b/caffe2/python/modeling/parameter_sharing_test.py index f616fc1ea6ed..d37e40880c02 100644 --- a/caffe2/python/modeling/parameter_sharing_test.py +++ b/caffe2/python/modeling/parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, model_helper, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/models/__sym_init__.py b/caffe2/python/models/__sym_init__.py index 79f045879ebc..fa10bff7246b 100644 --- a/caffe2/python/models/__sym_init__.py +++ b/caffe2/python/models/__sym_init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from 
__future__ import unicode_literals + + + + import os from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/models/download.py b/caffe2/python/models/download.py index 4b9a570de807..46a9b59f6627 100644 --- a/caffe2/python/models/download.py +++ b/caffe2/python/models/download.py @@ -1,9 +1,9 @@ ## @package download # Module caffe2.python.models.download -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os import sys diff --git a/caffe2/python/models/imagenet_trainer_test_utils.py b/caffe2/python/models/imagenet_trainer_test_utils.py index 59107336ccd6..fec7708ea150 100644 --- a/caffe2/python/models/imagenet_trainer_test_utils.py +++ b/caffe2/python/models/imagenet_trainer_test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import time diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 41ca087d9637..430d3d335e1e 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -1,9 +1,9 @@ ## @package resnet # Module caffe2.python.models.resnet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import brew import logging diff --git a/caffe2/python/models/resnet_test.py b/caffe2/python/models/resnet_test.py index ce542e8da046..38d87cefff05 100644 --- a/caffe2/python/models/resnet_test.py +++ b/caffe2/python/models/resnet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/models/seq2seq/beam_search.py b/caffe2/python/models/seq2seq/beam_search.py index 7b909697fb05..6fc9f8ece480 100644 --- a/caffe2/python/models/seq2seq/beam_search.py +++ b/caffe2/python/models/seq2seq/beam_search.py @@ -1,9 +1,9 @@ ## @package beam_search # Module caffe2.python.models.seq2seq.beam_search -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple from caffe2.python import core diff --git a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py index 0ee1f6e35ba0..c10d2f1ab4ed 100644 --- a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper.py b/caffe2/python/models/seq2seq/seq2seq_model_helper.py index b2a50c4bd58b..5adabb86fadf 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper.py @@ -1,9 +1,9 @@ ## @package seq2seq_model_helper # Module caffe2.python.models.seq2seq.seq2seq_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.model_helper 
import ModelHelper diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py index 8095440f2e5a..b70b74d39dc9 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.models.seq2seq import seq2seq_model_helper from caffe2.python import scope, test_util diff --git a/caffe2/python/models/seq2seq/seq2seq_util.py b/caffe2/python/models/seq2seq/seq2seq_util.py index d0702880c1ec..e1b4224ea4c8 100644 --- a/caffe2/python/models/seq2seq/seq2seq_util.py +++ b/caffe2/python/models/seq2seq/seq2seq_util.py @@ -2,10 +2,10 @@ # Module caffe2.python.examples.seq2seq_util """ A bunch of util functions to build Seq2Seq models with Caffe2.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections from future.utils import viewitems diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py index df68e3e30d7b..8080318da4d0 100644 --- a/caffe2/python/models/seq2seq/train.py +++ b/caffe2/python/models/seq2seq/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.models.seq2seq.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import collections diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index d2b6a4f6399f..7e77f623e553 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -1,9 +1,9 @@ ## @package translate # Module caffe2.python.models.seq2seq.translate -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import ABCMeta, abstractmethod import argparse diff --git a/caffe2/python/models/shufflenet.py b/caffe2/python/models/shufflenet.py index c9075a4a1295..33a7f7a4b7c5 100644 --- a/caffe2/python/models/shufflenet.py +++ b/caffe2/python/models/shufflenet.py @@ -1,9 +1,9 @@ # Module caffe2.python.models.shufflenet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/models/shufflenet_test.py b/caffe2/python/models/shufflenet_test.py index 344c720b3eb6..6ccfd0a83354 100644 --- a/caffe2/python/models/shufflenet_test.py +++ b/caffe2/python/models/shufflenet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/modifier_context.py b/caffe2/python/modifier_context.py index 008e651e41f7..b65d97587549 100644 --- a/caffe2/python/modifier_context.py +++ b/caffe2/python/modifier_context.py @@ -1,9 +1,9 @@ # @package modifier_context # Module caffe2.python.modifier_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + 
DEFAULT_MODIFIER = 'DEFAULT' diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py index f1af8c3eb521..70dcdec11a58 100644 --- a/caffe2/python/net_builder.py +++ b/caffe2/python/net_builder.py @@ -1,9 +1,9 @@ ## @package net_builder # Module caffe2.python.net_builder -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, context from caffe2.python.task import Task, TaskGroup diff --git a/caffe2/python/net_builder_test.py b/caffe2/python/net_builder_test.py index 169419c5c17b..bef6caefac3d 100644 --- a/caffe2/python/net_builder_test.py +++ b/caffe2/python/net_builder_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace from caffe2.python.core import Plan, to_execution_step, Net diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py index 1fd0833a718f..b55699c1c095 100644 --- a/caffe2/python/net_drawer.py +++ b/caffe2/python/net_drawer.py @@ -1,9 +1,9 @@ ## @package net_drawer # Module caffe2.python.net_drawer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import json import logging diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 09cde6f76767..8e1d65c01ce7 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -1,9 +1,9 @@ ## @package net_printer # Module caffe2.python.net_printer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job diff --git a/caffe2/python/net_printer_test.py b/caffe2/python/net_printer_test.py index bc086c3eee2a..e71a2b323dea 100644 --- a/caffe2/python/net_printer_test.py +++ b/caffe2/python/net_printer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import net_printer from caffe2.python.checkpoint import Job diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index c2f1774c7b2b..2b83e0ec9358 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import errno import os diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 6ff47c6d4c9a..3d9adc696486 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/nomnigraph_transformations.py b/caffe2/python/nomnigraph_transformations.py index f4bc2c68bbb6..570c743df152 100644 --- a/caffe2/python/nomnigraph_transformations.py +++ b/caffe2/python/nomnigraph_transformations.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals 
+ from collections import defaultdict diff --git a/caffe2/python/nomnigraph_transformations_test.py b/caffe2/python/nomnigraph_transformations_test.py index 6c58691db277..adbfe1a4885a 100644 --- a/caffe2/python/nomnigraph_transformations_test.py +++ b/caffe2/python/nomnigraph_transformations_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import test_util as tu diff --git a/caffe2/python/normalizer.py b/caffe2/python/normalizer.py index 1d452c6cbe60..2ca147328c78 100644 --- a/caffe2/python/normalizer.py +++ b/caffe2/python/normalizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.normalizer -from __future__ import absolute_import, division, print_function, unicode_literals + class Normalizer(object): diff --git a/caffe2/python/normalizer_context.py b/caffe2/python/normalizer_context.py index 57c1052103dc..a85b993b4502 100644 --- a/caffe2/python/normalizer_context.py +++ b/caffe2/python/normalizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.normalizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/normalizer_test.py b/caffe2/python/normalizer_test.py index 1f4cb4896778..f0ce5099ea75 100644 --- a/caffe2/python/normalizer_test.py +++ b/caffe2/python/normalizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python.normalizer_context import UseNormalizer, NormalizerContext from caffe2.python.normalizer import BatchNormalizer diff --git a/caffe2/python/numa_benchmark.py b/caffe2/python/numa_benchmark.py index 21c1cb158da1..a840c6932123 100644 --- a/caffe2/python/numa_benchmark.py +++ b/caffe2/python/numa_benchmark.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 692f515abe87..aba6e420ed55 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/observer_test.py b/caffe2/python/observer_test.py index 684514d17268..cc3ca1718a5c 100644 --- a/caffe2/python/observer_test.py +++ b/caffe2/python/observer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 9fe7b23bb7ae..d0f768e42eeb 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,10 +5,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import collections diff --git a/caffe2/python/onnx/backend_cpp_rep.py b/caffe2/python/onnx/backend_cpp_rep.py index 27135b35763d..4a75068cfd03 100644 --- a/caffe2/python/onnx/backend_cpp_rep.py +++ b/caffe2/python/onnx/backend_cpp_rep.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.backend_rep_cpp -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from onnx.backend.base import BackendRep, namedtupledict diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 13feea3ac8c9..ab97fd562dc1 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -1,9 +1,9 @@ # @package onnx # Module caffe2.python.onnx.backend_rep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index a30ebdfc3f54..126eef8a8470 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.bin.conversion -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json diff --git a/caffe2/python/onnx/error.py b/caffe2/python/onnx/error.py index da72af2cc9b1..1bac8290464d 100644 --- a/caffe2/python/onnx/error.py +++ b/caffe2/python/onnx/error.py @@ -1,8 +1,8 @@ ## @package onnx # Module caffe2.python.onnx.error -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + class BaseException(Exception): pass class Unsupported(BaseException): pass diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 0fc1c0328093..ee3c30949ff7 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -6,10 +6,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import logging diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index e1d56e1a6766..7f8f1a6d346a 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from onnx.backend.base import namedtupledict diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index 6bbd35cd434c..a04e7e4554b9 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -5,10 +5,10 @@ ONNXIFI a Caffe2 net """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index a859b572bae6..7eafccaec9e4 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import numpy as np diff --git a/caffe2/python/onnx/tests/__init__.py b/caffe2/python/onnx/tests/__init__.py index e0a02b9d5d83..fd40910d9e70 100644 --- a/caffe2/python/onnx/tests/__init__.py +++ b/caffe2/python/onnx/tests/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d909cf828042..d253b06658a3 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.c2_ref_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import os diff --git a/caffe2/python/onnx/tests/conversion_test.py b/caffe2/python/onnx/tests/conversion_test.py index 8fa128acd62b..86cdddcd1692 100644 --- a/caffe2/python/onnx/tests/conversion_test.py +++ b/caffe2/python/onnx/tests/conversion_test.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.tests.conversion_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json import six diff --git a/caffe2/python/onnx/tests/helper_test.py b/caffe2/python/onnx/tests/helper_test.py index e3682780cb04..9000ad94fd9b 100644 --- a/caffe2/python/onnx/tests/helper_test.py +++ b/caffe2/python/onnx/tests/helper_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.helper_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index ad7885fcda74..e4de0a19c07a 100644 
--- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.onnx_backend_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index 34f849400e30..d34d4a0e5287 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.ssa_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy import onnx diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index 1fec938c8e88..d224daf05ba3 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.test_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import unittest diff --git a/caffe2/python/onnx/workspace.py b/caffe2/python/onnx/workspace.py index a311ec37dfdc..f03e3609fe8b 100644 --- a/caffe2/python/onnx/workspace.py +++ b/caffe2/python/onnx/workspace.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import uuid diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 6e08f920a422..3a1ebcd4ec67 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 6a7a5ca18ef3..132bee879f6d 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 4cb9a54ec664..265d783e6336 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index 5ed2d0287e63..55e2f570cf07 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import functools diff --git 
a/caffe2/python/operator_test/adagrad_test_helper.py b/caffe2/python/operator_test/adagrad_test_helper.py index 891361e3a879..0fe4aa21f5f9 100644 --- a/caffe2/python/operator_test/adagrad_test_helper.py +++ b/caffe2/python/operator_test/adagrad_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from functools import partial diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 0d188abc52be..2fb13c149922 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 7e37216b82c1..76b09fdd5cd6 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/apmeter_test.py b/caffe2/python/operator_test/apmeter_test.py index b7a50ab98e87..1ca26bf64f31 100644 --- a/caffe2/python/operator_test/apmeter_test.py +++ b/caffe2/python/operator_test/apmeter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index ce800636e6e6..330d17ed6999 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/assert_test.py b/caffe2/python/operator_test/assert_test.py index e3474c0da7a4..2bbca5ab7376 100644 --- a/caffe2/python/operator_test/assert_test.py +++ b/caffe2/python/operator_test/assert_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py index 753e76f15319..88e38df52da5 100644 --- a/caffe2/python/operator_test/atomic_ops_test.py +++ b/caffe2/python/operator_test/atomic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/basic_rnn_test.py b/caffe2/python/operator_test/basic_rnn_test.py index 516c066c6ed8..e863289d488c 100644 --- a/caffe2/python/operator_test/basic_rnn_test.py +++ b/caffe2/python/operator_test/basic_rnn_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, rnn_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index 19186220159c..c9306ce1ab07 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index fb13b0c08933..82def0572686 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index c3ee8750225b..12dd72a4160a 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index ef59ed23888f..adfc735c66fd 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index f1ee07c0d1e3..d2584f18af40 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py index 77faeaeeb608..147a41282505 100644 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6e4c25c77c78..6cf8170b34f8 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 9ccaeaf9e7a7..05b8212242e4 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index e3bc9f248d3a..8cba2aecf1a4 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index bfbe9b7396fa..3131316feefd 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index d1cd6ada7f55..bf9af112a5b0 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cast_op_test.py b/caffe2/python/operator_test/cast_op_test.py index f7ffb5b45b47..bf2a210086e6 100644 --- a/caffe2/python/operator_test/cast_op_test.py +++ b/caffe2/python/operator_test/cast_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 4e30c915ce2a..e8ee47702445 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 7d614047f48d..7adc5ce24fb7 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ 
b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index d420484bac6b..b821e7b6a43c 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index cbef433ae0d3..72eedc479dd6 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/checkpoint_test.py b/caffe2/python/operator_test/checkpoint_test.py index 7449ab61f32d..3042e5989764 100644 --- a/caffe2/python/operator_test/checkpoint_test.py +++ b/caffe2/python/operator_test/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util import os diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index c2d9809c8d80..3304121aab08 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index ee5bd8f73eb3..efc86815bc49 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index b5d726d449fc..28e6cd3b3df6 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index bbe0e8eda1c1..1927b4eac78f 100644 --- 
a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index 88d8fd8b7a27..2e214f089a45 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py index ccd78eea4aa3..a240e98fc51e 100644 --- a/caffe2/python/operator_test/conftest.py +++ b/caffe2/python/operator_test/conftest.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 3e24e05191ac..ae54cd37a91d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import collections import functools diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py index 6bed93226f5b..4fcb6361d0a6 100644 --- a/caffe2/python/operator_test/conv_transpose_test.py +++ b/caffe2/python/operator_test/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py index 4efec570e812..2b8b756cdf61 100644 --- a/caffe2/python/operator_test/copy_ops_test.py +++ b/caffe2/python/operator_test/copy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py index 9024ee3edfd1..8e914259bb78 100644 --- a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py +++ b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import logging diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 1124df94e67a..04bfbbe6f4f6 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import 
hypothesis.strategies as st diff --git a/caffe2/python/operator_test/counter_ops_test.py b/caffe2/python/operator_test/counter_ops_test.py index 3ebe26415622..d57ff31508c6 100644 --- a/caffe2/python/operator_test/counter_ops_test.py +++ b/caffe2/python/operator_test/counter_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index d9eb89fc3352..b75e7b7b1a10 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf, brew from caffe2.python.model_helper import ModelHelper import numpy as np diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index 25dc6791fa12..d1852e7dd9e8 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 21ca68fe007a..1dda7166e65a 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import caffe2_flaky diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0fd38a82b403..8bc7eb47d488 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index 5de901026eb6..db1b826cfe41 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import model_helper, workspace, core, rnn_cell from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/data_couple_op_test.py b/caffe2/python/operator_test/data_couple_op_test.py index 32cf21e81bbf..d840207159b2 100644 --- a/caffe2/python/operator_test/data_couple_op_test.py +++ 
b/caffe2/python/operator_test/data_couple_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py index 138ac90e68c8..96d93dc5effb 100644 --- a/caffe2/python/operator_test/dataset_ops_test.py +++ b/caffe2/python/operator_test/dataset_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace, dataset from caffe2.python.dataset import Const diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index 31e407499063..f6ad0e38e73c 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import os import unittest diff --git a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py index aea30d890416..8b6f42417fd4 100644 --- a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py +++ b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index af431f1f07d4..2d6d6429f833 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/detectron_keypoints.py b/caffe2/python/operator_test/detectron_keypoints.py index 2f34349beae4..1abff0675993 100644 --- a/caffe2/python/operator_test/detectron_keypoints.py +++ b/caffe2/python/operator_test/detectron_keypoints.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + try: import cv2 diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 753b94d20f1f..e948fdae9673 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index c8c46127e4d9..84c2f7e35f56 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py 
@@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import assume, given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/duplicate_operands_test.py b/caffe2/python/operator_test/duplicate_operands_test.py index 385e69fded4c..179b42dbabc8 100644 --- a/caffe2/python/operator_test/duplicate_operands_test.py +++ b/caffe2/python/operator_test/duplicate_operands_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index 8c7df5f33625..ac0dc3dd0975 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index e35b4a483c6d..3195d969dee5 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index ef9c1b9c8cf3..605c1d741271 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index ca2b847f088c..ed7a09eb0857 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/emptysample_ops_test.py b/caffe2/python/operator_test/emptysample_ops_test.py index a04e9d0e161d..0f728b723163 100644 --- a/caffe2/python/operator_test/emptysample_ops_test.py +++ b/caffe2/python/operator_test/emptysample_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index 
c8c12e240946..b843bfdc95b9 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/operator_test/ensure_clipped_test.py b/caffe2/python/operator_test/ensure_clipped_test.py index 8d3c638e1ba1..a89718745b1c 100644 --- a/caffe2/python/operator_test/ensure_clipped_test.py +++ b/caffe2/python/operator_test/ensure_clipped_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/ensure_cpu_output_op_test.py b/caffe2/python/operator_test/ensure_cpu_output_op_test.py index 509c28a5a8bb..4812ee3042e0 100644 --- a/caffe2/python/operator_test/ensure_cpu_output_op_test.py +++ b/caffe2/python/operator_test/ensure_cpu_output_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/erf_op_test.py b/caffe2/python/operator_test/erf_op_test.py index 5761c8409bd3..64714db4315c 100644 --- a/caffe2/python/operator_test/erf_op_test.py +++ b/caffe2/python/operator_test/erf_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import math diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 4be96208fbba..0d198b1aff14 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index c08596f8717d..1e8b5522053d 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 1d64b19b993f..19fa329c9389 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index 4a2d9419d7bc..e080dde3eb5f 100644 --- 
a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index c6d2856c3514..fc25913d8744 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/flatten_op_test.py b/caffe2/python/operator_test/flatten_op_test.py index 19d204e0bded..2e0340c68779 100644 --- a/caffe2/python/operator_test/flatten_op_test.py +++ b/caffe2/python/operator_test/flatten_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 9542ecd30691..3e0e5722b0ce 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index 5877cb6da4e8..8c0974bb8579 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py index ecabe7d29ef0..12d0b0265afb 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import math import struct diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py index 09225385191a..e9af40a128a6 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 967131de38d8..fc23be13fdae 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -1,7 +1,7 @@ -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index 19d538c60556..c0d73af33601 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py index 3b1b4bf86515..7dea8f308783 100644 --- a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py index bcd277cf258b..3d929ce5c0ee 100644 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f70c0739ded8..f38df09ec9fb 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 1d46888e791a..62aba236d5ba 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index d17998c32986..14300beed3f9 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99da7a3f5626..99444f39ac26 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -1,7 +1,7 @@ -from __future__ 
import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, scope, gru_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py index ae8c1dc22799..e683a04d7998 100644 --- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py +++ b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import torch diff --git a/caffe2/python/operator_test/hsm_test.py b/caffe2/python/operator_test/hsm_test.py index f2321adc8e01..245bca210ad9 100644 --- a/caffe2/python/operator_test/hsm_test.py +++ b/caffe2/python/operator_test/hsm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np import unittest diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 9fdf0cabb0bd..90a8197e7ccf 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 98e9d61b5bd0..760228382bc6 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/image_input_op_test.py b/caffe2/python/operator_test/image_input_op_test.py index 79acc60739f1..0de1f0ad048b 100644 --- a/caffe2/python/operator_test/image_input_op_test.py +++ b/caffe2/python/operator_test/image_input_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest try: diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index f7c6d0cdc14a..1eb7ffa20691 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py index 642f340fad80..cf021f59362b 100644 --- a/caffe2/python/operator_test/index_ops_test.py +++ b/caffe2/python/operator_test/index_ops_test.py @@ -1,7 +1,7 @@ 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index e57b8a8e11d8..fb4f3c935ba8 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 212f807addcf..79d79ae6de21 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 51faa14b9029..6ed2db2e88c2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/key_split_ops_test.py b/caffe2/python/operator_test/key_split_ops_test.py index be38ee38926f..18fddff58d17 100644 --- a/caffe2/python/operator_test/key_split_ops_test.py +++ b/caffe2/python/operator_test/key_split_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lars_test.py b/caffe2/python/operator_test/lars_test.py index e2f02b29d26f..6f976520e06b 100644 --- a/caffe2/python/operator_test/lars_test.py +++ b/caffe2/python/operator_test/lars_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 89ba4b2017bd..56cd72d69991 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, workspace from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/leaky_relu_test.py b/caffe2/python/operator_test/leaky_relu_test.py index 2eaa782eeefd..9a888cac7901 100644 --- a/caffe2/python/operator_test/leaky_relu_test.py +++ b/caffe2/python/operator_test/leaky_relu_test.py @@ -1,6 +1,6 @@ -from __future__ import 
absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 3a5d44663771..1891171b80d8 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 1a1f9eb8c842..bdce6a4c78f7 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ b/caffe2/python/operator_test/learning_rate_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py index fa3ac0826230..28d7134ac5e8 100644 --- a/caffe2/python/operator_test/length_split_op_test.py +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index d9cd2b244604..626ec0542b7d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index 88c99c3da337..fc4e89e2545b 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index 4a9a6b0ff1a9..e0a5f9609588 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 8bc27c31144f..b8b082a02125 100644 --- 
a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index 8f4f680de109..c08f1180a920 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index a5e28479cf10..845bafee4702 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import hypothesis.strategies as st from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index cfd49b8a7eb8..6eb3181ea9ad 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings, assume diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index e57bdb7a1d41..24cb65ac96f8 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py index 1fcacc4f26f8..3a58cbe6d960 100644 --- a/caffe2/python/operator_test/lpnorm_op_test.py +++ b/caffe2/python/operator_test/lpnorm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/map_ops_test.py b/caffe2/python/operator_test/map_ops_test.py index add86a3a467e..dcc8b295f7c3 100644 --- a/caffe2/python/operator_test/map_ops_test.py +++ b/caffe2/python/operator_test/map_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import numpy as np diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 354aed27aaf4..e28dd1ce28f8 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ 
b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index e18025ffb92d..4849b83648f8 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index fababb13c54a..b8cef19b24df 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 77c6b82625b1..5830089f8e9b 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 9f3302c6e75a..36b765557505 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/mkl_conv_op_test.py b/caffe2/python/operator_test/mkl_conv_op_test.py index b72848b9a422..595debf977fe 100644 --- a/caffe2/python/operator_test/mkl_conv_op_test.py +++ b/caffe2/python/operator_test/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mkl_packed_fc_op_test.py b/caffe2/python/operator_test/mkl_packed_fc_op_test.py index 59546d3891e9..2f889d693444 100644 --- a/caffe2/python/operator_test/mkl_packed_fc_op_test.py +++ b/caffe2/python/operator_test/mkl_packed_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mod_op_test.py b/caffe2/python/operator_test/mod_op_test.py index 92a318f3f10f..914bffd2067c 100644 --- a/caffe2/python/operator_test/mod_op_test.py +++ b/caffe2/python/operator_test/mod_op_test.py @@ -1,7 +1,7 @@ -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index ae9d9158f506..3b270df254ce 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index a37e27141bd0..58f16e87a21c 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/mpi_test.py b/caffe2/python/operator_test/mpi_test.py index 0885289c7c1a..bb111a125fc0 100644 --- a/caffe2/python/operator_test/mpi_test.py +++ b/caffe2/python/operator_test/mpi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mul_gradient_benchmark.py b/caffe2/python/operator_test/mul_gradient_benchmark.py index 721676239409..2e11aefcb497 100644 --- a/caffe2/python/operator_test/mul_gradient_benchmark.py +++ b/caffe2/python/operator_test/mul_gradient_benchmark.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index 14ca954d363f..137be1eece34 100644 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ngram_ops_test.py b/caffe2/python/operator_test/ngram_ops_test.py index 70aad5cab814..3f4e57fa230b 100644 --- a/caffe2/python/operator_test/ngram_ops_test.py +++ b/caffe2/python/operator_test/ngram_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py index 46f88a1de079..7a35e0bafa31 100644 --- a/caffe2/python/operator_test/normalize_op_test.py +++ b/caffe2/python/operator_test/normalize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + 
import functools diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index 398b0d4b93ab..a202581f808c 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index d0b7a08ee706..593d5b5aa58c 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 811e38e34af7..4cff53b87d6e 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index 3777fdd7695d..7b3f40a27c97 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 84f3f46a6dc1..698fbb76df88 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ b/caffe2/python/operator_test/pack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 6bf2315ca0c5..9a76e6b847a5 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 43cd10c23188..6d4e6bbdcd08 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py index a5a7db12b1ef..b600c302d83b 100644 --- 
a/caffe2/python/operator_test/partition_ops_test.py +++ b/caffe2/python/operator_test/partition_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace from caffe2.python.test_util import TestCase, rand_array diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index 54c42bf63917..d81b0a963185 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index 463380306ce4..d7c4e0df4416 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index 743cee5cef3c..7ef98249bd79 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/prepend_dim_test.py b/caffe2/python/operator_test/prepend_dim_test.py index 6cf8e7a81b5e..d794ba2162b9 100644 --- a/caffe2/python/operator_test/prepend_dim_test.py +++ b/caffe2/python/operator_test/prepend_dim_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/python_op_test.py b/caffe2/python/operator_test/python_op_test.py index 7467c8c3900c..b071070151d1 100644 --- a/caffe2/python/operator_test/python_op_test.py +++ b/caffe2/python/operator_test/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/quantile_test.py b/caffe2/python/operator_test/quantile_test.py index 6a4250d06183..39f3728d8e81 100644 --- a/caffe2/python/operator_test/quantile_test.py +++ b/caffe2/python/operator_test/quantile_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/rand_quantization_op_speed_test.py b/caffe2/python/operator_test/rand_quantization_op_speed_test.py index ce0e84028541..1c56faff645f 
100644 --- a/caffe2/python/operator_test/rand_quantization_op_speed_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_speed_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import time diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index 811a20505a3c..e244f77149e1 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import struct diff --git a/caffe2/python/operator_test/rank_loss_operator_test.py b/caffe2/python/operator_test/rank_loss_operator_test.py index 94220d76762d..2d52da293127 100644 --- a/caffe2/python/operator_test/rank_loss_operator_test.py +++ b/caffe2/python/operator_test/rank_loss_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/rebatching_queue_test.py b/caffe2/python/operator_test/rebatching_queue_test.py index 930fad30d663..53d3fd4f4ecc 100644 --- a/caffe2/python/operator_test/rebatching_queue_test.py +++ b/caffe2/python/operator_test/rebatching_queue_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/record_queue_test.py b/caffe2/python/operator_test/record_queue_test.py index d32b3e794ab4..00e47ed1cb68 100644 --- a/caffe2/python/operator_test/record_queue_test.py +++ b/caffe2/python/operator_test/record_queue_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataset import Dataset from caffe2.python.schema import ( diff --git a/caffe2/python/operator_test/recurrent_net_executor_test.py b/caffe2/python/operator_test/recurrent_net_executor_test.py index 24bd0122f4fb..5d9b83604423 100644 --- a/caffe2/python/operator_test/recurrent_net_executor_test.py +++ b/caffe2/python/operator_test/recurrent_net_executor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import model_helper, workspace, core, rnn_cell, test_util diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 7cf79edfafed..13650e6cad4e 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import recurrent, workspace from caffe2.python.model_helper import ModelHelper 
diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index ffb5e8a02667..727631befe89 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 018024900281..7d4287df6609 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index 9c57ed4f3090..a42f00bbf82f 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import six from numpy.testing import assert_array_equal diff --git a/caffe2/python/operator_test/resize_op_test.py b/caffe2/python/operator_test/resize_op_test.py index 893e09cf6443..cd90656f607d 100644 --- a/caffe2/python/operator_test/resize_op_test.py +++ b/caffe2/python/operator_test/resize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/rmac_regions_op_test.py b/caffe2/python/operator_test/rmac_regions_op_test.py index 856832c34b99..084d7402df5f 100644 --- a/caffe2/python/operator_test/rmac_regions_op_test.py +++ b/caffe2/python/operator_test/rmac_regions_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rms_norm_op_test.py b/caffe2/python/operator_test/rms_norm_op_test.py index f5a35701877c..797b3c9a01c3 100644 --- a/caffe2/python/operator_test/rms_norm_op_test.py +++ b/caffe2/python/operator_test/rms_norm_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 64cd7bf48913..8fe037ccb70c 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import ( core, gradient_checker, rnn_cell, workspace, scope, utils diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py 
b/caffe2/python/operator_test/roi_align_rotated_op_test.py index 0487d962e6fb..c74157a039b0 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/rowwise_counter_test.py b/caffe2/python/operator_test/rowwise_counter_test.py index a00dd24b3f2c..a9dacc5a6d86 100644 --- a/caffe2/python/operator_test/rowwise_counter_test.py +++ b/caffe2/python/operator_test/rowwise_counter_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/scale_op_test.py b/caffe2/python/operator_test/scale_op_test.py index 14e17dc2c5d5..b5507e2013fa 100644 --- a/caffe2/python/operator_test/scale_op_test.py +++ b/caffe2/python/operator_test/scale_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py index 01c415eac953..f991a7dde211 100644 --- a/caffe2/python/operator_test/segment_ops_test.py +++ b/caffe2/python/operator_test/segment_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from functools import partial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/selu_op_test.py b/caffe2/python/operator_test/selu_op_test.py index fc903f159a4e..4dd2fa1848bf 100644 --- a/caffe2/python/operator_test/selu_op_test.py +++ b/caffe2/python/operator_test/selu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 720bf9f02030..4609473f91f0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index aca6ff38a517..702effc226d6 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py index a925783c206e..6e8cae62dbff 100644 --- 
a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py +++ b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softmax_ops_test.py b/caffe2/python/operator_test/softmax_ops_test.py index f0f6c22cd10b..3ae26de6b513 100644 --- a/caffe2/python/operator_test/softmax_ops_test.py +++ b/caffe2/python/operator_test/softmax_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softplus_op_test.py b/caffe2/python/operator_test/softplus_op_test.py index ac28a1a9a51e..dd183b774f92 100644 --- a/caffe2/python/operator_test/softplus_op_test.py +++ b/caffe2/python/operator_test/softplus_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py index 14d637f50f41..2ba21bb6d44f 100644 --- a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py +++ b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py index 9bdae01d1318..f1f85b1f9bec 100644 --- a/caffe2/python/operator_test/sparse_gradient_checker_test.py +++ b/caffe2/python/operator_test/sparse_gradient_checker_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py index 74690c8a2c56..fb958492cfa9 100644 --- a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py +++ b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git a/caffe2/python/operator_test/sparse_lp_regularizer_test.py b/caffe2/python/operator_test/sparse_lp_regularizer_test.py index b0d0b4b5c9b3..7ea32bd69a29 100644 --- a/caffe2/python/operator_test/sparse_lp_regularizer_test.py +++ b/caffe2/python/operator_test/sparse_lp_regularizer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git 
a/caffe2/python/operator_test/sparse_normalize_test.py b/caffe2/python/operator_test/sparse_normalize_test.py index bd8dbd5f7b53..ecc4ae0c8d22 100644 --- a/caffe2/python/operator_test/sparse_normalize_test.py +++ b/caffe2/python/operator_test/sparse_normalize_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git a/caffe2/python/operator_test/sparse_ops_test.py b/caffe2/python/operator_test/sparse_ops_test.py index 1cf243ed05c4..089174007b18 100644 --- a/caffe2/python/operator_test/sparse_ops_test.py +++ b/caffe2/python/operator_test/sparse_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import rand_array diff --git a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py index 03deb62d8513..41ec8808bb6a 100644 --- a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py +++ b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 1186161e5f46..35f7bd2a5e29 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace diff --git a/caffe2/python/operator_test/specialized_segment_ops_test.py b/caffe2/python/operator_test/specialized_segment_ops_test.py index fe768e193c88..4f1842ac4664 100644 --- a/caffe2/python/operator_test/specialized_segment_ops_test.py +++ b/caffe2/python/operator_test/specialized_segment_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 172c6cbafa16..5bd6cb1d08f8 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/stats_ops_test.py b/caffe2/python/operator_test/stats_ops_test.py index edc36facb236..6114dfed3b10 100644 --- a/caffe2/python/operator_test/stats_ops_test.py +++ b/caffe2/python/operator_test/stats_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from 
caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py index 0a42d5d23728..12a9e6826fd1 100644 --- a/caffe2/python/operator_test/stats_put_ops_test.py +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/storm_test.py b/caffe2/python/operator_test/storm_test.py index 2ae402a8a290..c97f631d2160 100644 --- a/caffe2/python/operator_test/storm_test.py +++ b/caffe2/python/operator_test/storm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index 969e8c7e11e5..eedb57be1d6c 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/text_file_reader_test.py b/caffe2/python/operator_test/text_file_reader_test.py index 41ba814af6ab..8889ddb9f53c 100644 --- a/caffe2/python/operator_test/text_file_reader_test.py +++ b/caffe2/python/operator_test/text_file_reader_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.text_file_reader import TextFileReader from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/thresholded_relu_op_test.py b/caffe2/python/operator_test/thresholded_relu_op_test.py index 9c103c85c03c..0cd5c0f77895 100644 --- a/caffe2/python/operator_test/thresholded_relu_op_test.py +++ b/caffe2/python/operator_test/thresholded_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/tile_op_test.py b/caffe2/python/operator_test/tile_op_test.py index 51471f797b34..d39dfeee0ad7 100644 --- a/caffe2/python/operator_test/tile_op_test.py +++ b/caffe2/python/operator_test/tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/top_k_test.py b/caffe2/python/operator_test/top_k_test.py index 85cf902812ee..fa628456c3a4 100644 --- a/caffe2/python/operator_test/top_k_test.py +++ b/caffe2/python/operator_test/top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff 
--git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index a1ddbaa9509e..55f26a89987f 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/transpose_op_test.py b/caffe2/python/operator_test/transpose_op_test.py index e4b739a741ac..4ccec250e22b 100644 --- a/caffe2/python/operator_test/transpose_op_test.py +++ b/caffe2/python/operator_test/transpose_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/trigonometric_op_test.py b/caffe2/python/operator_test/trigonometric_op_test.py index 5d57940dc33e..04b98857c301 100644 --- a/caffe2/python/operator_test/trigonometric_op_test.py +++ b/caffe2/python/operator_test/trigonometric_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/unique_ops_test.py b/caffe2/python/operator_test/unique_ops_test.py index 016554321983..b49f4765539e 100644 --- a/caffe2/python/operator_test/unique_ops_test.py +++ b/caffe2/python/operator_test/unique_ops_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/unique_uniform_fill_op_test.py b/caffe2/python/operator_test/unique_uniform_fill_op_test.py index f858e8fa06bd..1026745db724 100644 --- a/caffe2/python/operator_test/unique_uniform_fill_op_test.py +++ b/caffe2/python/operator_test/unique_uniform_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/upsample_op_test.py b/caffe2/python/operator_test/upsample_op_test.py index a56d1edebe68..61b01644bcf5 100644 --- a/caffe2/python/operator_test/upsample_op_test.py +++ b/caffe2/python/operator_test/upsample_op_test.py @@ -13,9 +13,9 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py index 2814d7a02775..241d1e4c1b56 100644 --- a/caffe2/python/operator_test/utility_ops_test.py +++ b/caffe2/python/operator_test/utility_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/video_input_op_test.py b/caffe2/python/operator_test/video_input_op_test.py index c06183c0f1bb..f21f219bd90e 100644 --- a/caffe2/python/operator_test/video_input_op_test.py +++ b/caffe2/python/operator_test/video_input_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import os import shutil diff --git a/caffe2/python/operator_test/weight_scale_test.py b/caffe2/python/operator_test/weight_scale_test.py index 9988ebc309d2..5cdc11eb4d11 100644 --- a/caffe2/python/operator_test/weight_scale_test.py +++ b/caffe2/python/operator_test/weight_scale_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/weighted_multi_sample_test.py b/caffe2/python/operator_test/weighted_multi_sample_test.py index 8b0966590594..830a9f9849c7 100644 --- a/caffe2/python/operator_test/weighted_multi_sample_test.py +++ b/caffe2/python/operator_test/weighted_multi_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sample_test.py b/caffe2/python/operator_test/weighted_sample_test.py index 24326d6337c4..032e9e9d755e 100644 --- a/caffe2/python/operator_test/weighted_sample_test.py +++ b/caffe2/python/operator_test/weighted_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 4940bc69a052..2c7dffe92672 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/wngrad_test.py b/caffe2/python/operator_test/wngrad_test.py index 2a48bed86358..48fe0f94731e 100644 --- a/caffe2/python/operator_test/wngrad_test.py +++ b/caffe2/python/operator_test/wngrad_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 21a61a93d00c..9a2f9f541420 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.optimizer -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/caffe2/python/optimizer_context.py b/caffe2/python/optimizer_context.py index 483f08dc5aff..d1593f440383 100644 --- a/caffe2/python/optimizer_context.py +++ b/caffe2/python/optimizer_context.py @@ -1,9 +1,9 @@ ## @package optimizer_context # Module caffe2.python.optimizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index a45571f19683..90f0932d23f6 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 import caffe2.python.optimizer as optimizer from caffe2.python.optimizer import ( diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index f7df35bfee70..02276b08c176 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -1,9 +1,9 @@ ## @package optimizer_test_util # Module caffe2.python.optimizer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py index 224dbf66b6ce..4ee446610bdb 100644 --- a/caffe2/python/parallel_workers.py +++ b/caffe2/python/parallel_workers.py @@ -1,9 +1,9 @@ # @package parallel_workers # Module caffe2.python.parallel_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/parallel_workers_test.py b/caffe2/python/parallel_workers_test.py index a3367e6ee351..a9a7c6a078d7 100644 --- a/caffe2/python/parallel_workers_test.py +++ b/caffe2/python/parallel_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/parallelize_bmuf_distributed_test.py b/caffe2/python/parallelize_bmuf_distributed_test.py index b3647a2007f5..c38a4ccc34d7 100644 --- a/caffe2/python/parallelize_bmuf_distributed_test.py +++ b/caffe2/python/parallelize_bmuf_distributed_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from multiprocessing import Process, Manager diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index 5b30da4387f3..4625d0b0458c 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -1,9 +1,9 @@ ## @package 
pipeline # Module caffe2.python.pipeline -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, queue_util from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/pipeline_test.py b/caffe2/python/pipeline_test.py index 5f57355b25d3..fe00933ac4e1 100644 --- a/caffe2/python/pipeline_test.py +++ b/caffe2/python/pipeline_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 7eea50464504..e0fa90bffb6e 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -1,10 +1,10 @@ ## @package mobile_exporter # Module caffe2.python.mobile_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 import numpy as np diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index 1c4cf77ea051..0269ec229888 100644 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.python import workspace, brew from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/predictor/predictor_exporter.py b/caffe2/python/predictor/predictor_exporter.py index e9759862fcb5..c8c68f9f30a0 100644 --- a/caffe2/python/predictor/predictor_exporter.py +++ b/caffe2/python/predictor/predictor_exporter.py @@ -1,9 +1,9 @@ ## @package predictor_exporter # Module caffe2.python.predictor.predictor_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.proto import metanet_pb2 diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index 9c8b16c30705..2a0685fb955c 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import tempfile import unittest diff --git a/caffe2/python/predictor/predictor_py_utils.py b/caffe2/python/predictor/predictor_py_utils.py index 1af5923952dc..cc831454a08c 100644 --- a/caffe2/python/predictor/predictor_py_utils.py +++ b/caffe2/python/predictor/predictor_py_utils.py @@ -1,9 +1,9 @@ ## @package predictor_py_utils # Module caffe2.python.predictor.predictor_py_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope diff --git 
a/caffe2/python/predictor/predictor_test.py b/caffe2/python/predictor/predictor_test.py index 26c4cae63b57..64c88006686c 100644 --- a/caffe2/python/predictor/predictor_test.py +++ b/caffe2/python/predictor/predictor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/predictor/serde.py b/caffe2/python/predictor/serde.py index af48b2920a87..2b8f1544803d 100644 --- a/caffe2/python/predictor/serde.py +++ b/caffe2/python/predictor/serde.py @@ -1,9 +1,9 @@ ## @package serde # Module caffe2.python.predictor.serde -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def serialize_protobuf_struct(protobuf_struct): diff --git a/caffe2/python/predictor_constants.py b/caffe2/python/predictor_constants.py index c1e1dedb8b09..eda0c66974f4 100644 --- a/caffe2/python/predictor_constants.py +++ b/caffe2/python/predictor_constants.py @@ -1,9 +1,9 @@ ## @package predictor_constants # Module caffe2.python.predictor_constants -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.proto.predictor_consts_pb2 as predictor_consts predictor_constants = predictor_consts.PredictorConsts() diff --git a/caffe2/python/python_op_test.py b/caffe2/python/python_op_test.py index 5a8cfe4a9b46..893671b96f45 100644 --- a/caffe2/python/python_op_test.py +++ b/caffe2/python/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py index 62265758c2f2..c9a91fc27d17 100644 --- a/caffe2/python/queue_util.py +++ b/caffe2/python/queue_util.py @@ -1,9 +1,9 @@ ## @package queue_util # Module caffe2.python.queue_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dataio from caffe2.python.task import TaskGroup diff --git a/caffe2/python/record_queue.py b/caffe2/python/record_queue.py index d5f129a2f902..1170c2bf3a82 100644 --- a/caffe2/python/record_queue.py +++ b/caffe2/python/record_queue.py @@ -3,10 +3,10 @@ """ Implementation of a queue wrapper. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/recurrent.py b/caffe2/python/recurrent.py index e5b48894efbc..d4762f08c683 100644 --- a/caffe2/python/recurrent.py +++ b/caffe2/python/recurrent.py @@ -1,9 +1,9 @@ ## @package recurrent # Module caffe2.python.recurrent -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from future.utils import viewitems, viewkeys diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py index e994de8b0c44..4042149ca80c 100644 --- a/caffe2/python/regularizer.py +++ b/caffe2/python/regularizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.regularizer -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core, utils import numpy as np diff --git a/caffe2/python/regularizer_context.py b/caffe2/python/regularizer_context.py index 6935fdcb47c0..5d79e138b6b7 100644 --- a/caffe2/python/regularizer_context.py +++ b/caffe2/python/regularizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.regularizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/regularizer_test.py b/caffe2/python/regularizer_test.py index 2018040433b4..685feaf93ed2 100644 --- a/caffe2/python/regularizer_test.py +++ b/caffe2/python/regularizer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/rnn/__init__.py b/caffe2/python/rnn/__init__.py index a37eb20fda26..3f2ff2d6cc8f 100644 --- a/caffe2/python/rnn/__init__.py +++ b/caffe2/python/rnn/__init__.py @@ -1,5 +1,5 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index c3bf9b30cea7..dee96413dbe5 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn/rnn_cell_test_util.py b/caffe2/python/rnn/rnn_cell_test_util.py index 1533c1e3d418..95728d682bfa 100644 --- a/caffe2/python/rnn/rnn_cell_test_util.py +++ b/caffe2/python/rnn/rnn_cell_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, scope from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 8192b34dc12e..e16bfaaf491e 100644 --- 
a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -1,9 +1,9 @@ ## @package rnn_cell # Module caffe2.python.rnn_cell -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools import inspect diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 50fe136a5a12..fb7cadf42847 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -13,10 +13,10 @@ walkthrough on how to use schema to store and iterate through a structured in-memory dataset. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py index 28bf5c64a428..dca19a127ef2 100644 --- a/caffe2/python/schema_test.py +++ b/caffe2/python/schema_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema import numpy as np diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py index be05aa468d10..11fddc7b0f62 100644 --- a/caffe2/python/scope.py +++ b/caffe2/python/scope.py @@ -1,9 +1,9 @@ ## @package scope # Module caffe2.python.scope -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index b24fc6851428..9bd69eb32902 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/serialized_test/coverage.py b/caffe2/python/serialized_test/coverage.py index 7ba93f66af6b..2014847242c4 100644 --- a/caffe2/python/serialized_test/coverage.py +++ b/caffe2/python/serialized_test/coverage.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index 30810d9d8283..621adca9454e 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/session.py b/caffe2/python/session.py index 9059e1eabc94..de3b09931a30 100644 --- a/caffe2/python/session.py +++ b/caffe2/python/session.py @@ -1,9 +1,9 @@ ## @package session # Module caffe2.python.session -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git 
a/caffe2/python/session_test.py b/caffe2/python/session_test.py index ae5e50d23ec7..fa505c296820 100644 --- a/caffe2/python/session_test.py +++ b/caffe2/python/session_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py index 375068ef537e..e62c7e6d41dc 100644 --- a/caffe2/python/sparse_to_dense_mask_test.py +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/sparse_to_dense_test.py b/caffe2/python/sparse_to_dense_test.py index 5e6d10823e5f..dc43d2c03394 100644 --- a/caffe2/python/sparse_to_dense_test.py +++ b/caffe2/python/sparse_to_dense_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 9dcb211274b3..f1b25ee26092 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -1,9 +1,9 @@ ## @package task # Module caffe2.python.task -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, context from caffe2.python.schema import Field, from_blob_list diff --git a/caffe2/python/task_test.py b/caffe2/python/task_test.py index f1c51bc5b442..c44e93a3704c 100644 --- a/caffe2/python/task_test.py +++ b/caffe2/python/task_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import task diff --git a/caffe2/python/test/blob_deallocation_test.py b/caffe2/python/test/blob_deallocation_test.py index 66d6835c4814..37886618ef45 100644 --- a/caffe2/python/test/blob_deallocation_test.py +++ b/caffe2/python/test/blob_deallocation_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace import unittest diff --git a/caffe2/python/test/do_op_test.py b/caffe2/python/test/do_op_test.py index 72e9f83c9540..fcc6918d5350 100644 --- a/caffe2/python/test/do_op_test.py +++ b/caffe2/python/test/do_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/test/executor_test.py b/caffe2/python/test/executor_test.py index 84df86fb05b0..b4db64005f62 100644 --- a/caffe2/python/test/executor_test.py +++ b/caffe2/python/test/executor_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace 
from caffe2.python.test.executor_test_util import ( diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index bf93c49d8cdc..ba10247eaa2e 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import ( diff --git a/caffe2/python/test/fakefp16_transform_test.py b/caffe2/python/test/fakefp16_transform_test.py index d58d12ad60de..f98342eba54a 100644 --- a/caffe2/python/test/fakefp16_transform_test.py +++ b/caffe2/python/test/fakefp16_transform_test.py @@ -1,6 +1,6 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import unittest from caffe2.python.fakefp16_transform_lib import fakeFp16FuseOps diff --git a/caffe2/python/test/gpu_context_test.py b/caffe2/python/test/gpu_context_test.py index 741f39d6dc8a..9ee8a308cc2e 100644 --- a/caffe2/python/test/gpu_context_test.py +++ b/caffe2/python/test/gpu_context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 817f5e21a563..7790e0f6d8f5 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + # make sure we use cpp implementation of protobuf import os diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index a2cf3aced07c..94ac41524065 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -1,9 +1,9 @@ ## @package test_util # Module caffe2.python.test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/text_file_reader.py b/caffe2/python/text_file_reader.py index 52a1b274f086..48f69f90c7b4 100644 --- a/caffe2/python/text_file_reader.py +++ b/caffe2/python/text_file_reader.py @@ -1,9 +1,9 @@ ## @package text_file_reader # Module caffe2.python.text_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader from caffe2.python.schema import Scalar, Struct, data_type_for_dtype diff --git a/caffe2/python/timeout_guard.py b/caffe2/python/timeout_guard.py index 07226c128ffe..2314a3ad9c24 100644 --- a/caffe2/python/timeout_guard.py +++ b/caffe2/python/timeout_guard.py @@ -1,9 +1,9 @@ ## @package timeout_guard # Module caffe2.python.timeout_guard -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/transformations.py b/caffe2/python/transformations.py index ed0a32788de8..fc1bad34b201 100644 --- a/caffe2/python/transformations.py +++ b/caffe2/python/transformations.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 363ceb19619d..14b97e4939ef 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index e95cb4bd46e3..39d37ca9fa0a 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index ce45ae3cb86d..0936941aac03 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -6,10 +6,10 @@ Note that ONNX-TRT enforce an NCHW input! """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op diff --git a/caffe2/python/tt_core.py b/caffe2/python/tt_core.py index a2011da16b15..314718b76c9d 100644 --- a/caffe2/python/tt_core.py +++ b/caffe2/python/tt_core.py @@ -1,8 +1,8 @@ ## @package tt_core # Module caffe2.python.tt_core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np diff --git a/caffe2/python/tt_core_test.py b/caffe2/python/tt_core_test.py index aec5764e66e5..0cee3b254720 100644 --- a/caffe2/python/tt_core_test.py +++ b/caffe2/python/tt_core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 9cf30d9c06b3..947dd9bf296d 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -1,9 +1,9 @@ # @package utils # Module caffe2.python.utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.compatibility import container_abcs diff --git a/caffe2/python/utils_test.py b/caffe2/python/utils_test.py index 3921f3d67ca7..ef809bfd8154 100644 --- a/caffe2/python/utils_test.py +++ b/caffe2/python/utils_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils, test_util diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 
f76fcf75a33a..99983e84f097 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -1,9 +1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections import contextlib from google.protobuf.message import Message diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 7e64220f480e..86dbcf5d70ba 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py index 08f658ba9608..4f4bad64980c 100644 --- a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections from itertools import product diff --git a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py index 27a07ece62be..1d3fd2cc369d 100644 --- a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py index 82dd1772d5da..24a2269cc850 100644 --- a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py +++ b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/concat_dnnlowp_op_test.py b/caffe2/quantization/server/concat_dnnlowp_op_test.py index 777c523aff87..fc7e897993d4 100644 --- a/caffe2/quantization/server/concat_dnnlowp_op_test.py +++ b/caffe2/quantization/server/concat_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py index 70bcf53f44d4..a605ea3fc49e 100644 --- a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py index ae2f49cfe20c..68c14b69f058 100644 --- a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git 
a/caffe2/quantization/server/conv_dnnlowp_op_test.py b/caffe2/quantization/server/conv_dnnlowp_op_test.py index 682a4d787aba..11cd12a4d5bc 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py index 9ed9106db0be..715b6f8c01a8 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py index 773253743c6d..99e914c294b9 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py index 399ae4363831..5694a553e744 100644 --- a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dnnlowp_test_utils.py b/caffe2/quantization/server/dnnlowp_test_utils.py index 1a41664cb2d1..0d56ea6ac127 100644 --- a/caffe2/quantization/server/dnnlowp_test_utils.py +++ b/caffe2/quantization/server/dnnlowp_test_utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py index 1cf65f37858a..75bd2f8e4d44 100644 --- a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py index 3f199f981331..af1cd0f80684 100644 --- a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py index b9104f598d08..e31b9d179071 100644 --- a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py index 
9b3caf41ecc5..faf526b8c48d 100644 --- a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py index 68059421cfac..5d77eceb8e04 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py index b8c4a3e22812..f1939e198b84 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_fp16_test.py b/caffe2/quantization/server/fully_connected_fp16_test.py index 710207f7caeb..be1e2c8a1ab5 100644 --- a/caffe2/quantization/server/fully_connected_fp16_test.py +++ b/caffe2/quantization/server/fully_connected_fp16_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py index a4ba681867ff..284ae56d743e 100644 --- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/gather_dnnlowp_op_test.py b/caffe2/quantization/server/gather_dnnlowp_op_test.py index c1f495260722..c2c7f35a66d4 100644 --- a/caffe2/quantization/server/gather_dnnlowp_op_test.py +++ b/caffe2/quantization/server/gather_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py index 93a4163c86bb..30051d95b59c 100644 --- a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/int8_gen_quant_params_test.py b/caffe2/quantization/server/int8_gen_quant_params_test.py index f2c7fd81dabb..d208d6f9b575 100644 --- a/caffe2/quantization/server/int8_gen_quant_params_test.py +++ b/caffe2/quantization/server/int8_gen_quant_params_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py index f34081aeba24..70f9b0c2f1fa 100644 --- a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py +++ b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py @@ -13,7 +13,7 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py index 9cd22bd2c491..bcf06ce0274e 100644 --- a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py +++ b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/observer_test.py b/caffe2/quantization/server/observer_test.py index 4299c146b2da..5c2b28e5e6fb 100644 --- a/caffe2/quantization/server/observer_test.py +++ b/caffe2/quantization/server/observer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/pool_dnnlowp_op_test.py b/caffe2/quantization/server/pool_dnnlowp_op_test.py index d581fbef00cd..fedc87ee732a 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op_test.py +++ b/caffe2/quantization/server/pool_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/quantize_dnnlowp_op_test.py b/caffe2/quantization/server/quantize_dnnlowp_op_test.py index caaf456fb84e..e61a28b4b930 100644 --- a/caffe2/quantization/server/quantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/quantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/relu_dnnlowp_op_test.py b/caffe2/quantization/server/relu_dnnlowp_op_test.py index 5e85b4e43ed6..68b5aed049f1 100644 --- a/caffe2/quantization/server/relu_dnnlowp_op_test.py +++ b/caffe2/quantization/server/relu_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py index 47ae47b81106..67017ee0afcc 100644 --- a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py 
b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py index 6af92a5d2fe5..b12b3908aafa 100644 --- a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py index 28ff4a0a750b..836745dcf543 100644 --- a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py +++ b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py index b1d34c19d3ae..d7253b1675f4 100644 --- a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/tanh_dnnlowp_op_test.py b/caffe2/quantization/server/tanh_dnnlowp_op_test.py index e0af7af62bba..f73befd25e26 100644 --- a/caffe2/quantization/server/tanh_dnnlowp_op_test.py +++ b/caffe2/quantization/server/tanh_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/utils.py b/caffe2/quantization/server/utils.py index 862ed5a9cd62..9e137cb5f6af 100644 --- a/caffe2/quantization/server/utils.py +++ b/caffe2/quantization/server/utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/scripts/get_python_cmake_flags.py b/scripts/get_python_cmake_flags.py index 0fac6d20d4d4..9121c5ebf0db 100644 --- a/scripts/get_python_cmake_flags.py +++ b/scripts/get_python_cmake_flags.py @@ -12,9 +12,9 @@ # make # -from __future__ import absolute_import -from __future__ import unicode_literals -from __future__ import print_function + + + from distutils import sysconfig import sys diff --git a/setup.py b/setup.py index 753e2b0f14a1..059188875e77 100644 --- a/setup.py +++ b/setup.py @@ -162,7 +162,7 @@ # When turned on, the following cmake variables will be toggled as well: # USE_SYSTEM_CPUINFO=ON USE_SYSTEM_SLEEF=ON BUILD_CUSTOM_PROTOBUF=OFF -from __future__ import print_function + import sys if sys.version_info < (3,): print("Python 2 has reached end-of-life and is no longer supported by PyTorch.") diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index acecbe737e6d..026293a9281a 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import absolute_import, division, print_function + import os import argparse import sys diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index e41c921f1e33..89bf64d8149e 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,7 +22,7 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. 
# -from __future__ import print_function + from .utils import CodeTemplate, nested_dict, write, uninplace_api_name from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 354aedc601ad..f8e8e61857e5 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -12,7 +12,7 @@ glob or regular expressions. """ -from __future__ import print_function + import argparse import collections diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 4b91abf1c6c7..118a3e9b58b7 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import os import collections from pprint import pformat diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index d5db749d1552..83253cc3a526 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -1,6 +1,6 @@ "Manages CMake." -from __future__ import print_function + import multiprocessing import os From 721cfbf8425cf2c1dc5e27d1332e32e1a42ef541 Mon Sep 17 00:00:00 2001 From: Dianshi Li Date: Wed, 23 Sep 2020 18:30:17 -0700 Subject: [PATCH 072/449] [PT Model Split] Support 2 operators in PT by C2 conversion (#45231) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45231 Two operators, `PriorCorrectionCalibrationPrediction` and `GatherRangesToDense`, are not supported in PT, which prevents GLOW from working. To unblock, we first try to use C2->PT conversion. In the long term, we need to implement PT custom ops. This diff does this conversion to unblock the current project. Test Plan: Run unit tests. The test input is from the current DPER example. All pass. ```buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_prior_correct_calibration_prediction_op --print-passing-details > c2 reference output > [0.14285715 0.27272728 0.39130434 0.5 ] > PT converted output > tensor([0.1429, 0.2727, 0.3913, 0.5000]) buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_gather_ranges_to_dense_op --print-passing-details c2 reference output > [array([[6, 5, 4, 3], [0, 0, 0, 0]], dtype=int64)] > PT converted output > [tensor([[6, 5, 4, 3], [0, 0, 0, 0]])] ``` Reviewed By: allwu, qizzzh Differential Revision: D23858329 fbshipit-source-id: ed37118ca7f09e1cd0ad1fdec3d37f66dce60dd9 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 +++ caffe2/operators/gather_ranges_to_dense_op.h | 3 + .../operator_test/torch_integration_test.py | 69 ++++++++++++++++++- 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor?
key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 55f26a89987f..7194daa91203 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -7,10 +7,12 @@ import torch import unittest -from caffe2.python import core, workspace +from caffe2.python import core, dyndep, workspace from hypothesis import given, settings from scipy.stats import norm +dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/operators:calibration_op') + def generate_rois(roi_counts, im_dims): assert len(roi_counts) == len(im_dims) @@ -875,6 +877,71 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + + def test_prior_correct_calibration_prediction_op(self): + beta = np.array([1.0, 2.0], dtype=np.float32) + gamma = np.array([3.0, 4.0], dtype=np.float32) + pred = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32) + + ref_op = core.CreateOperator( + "PriorCorrectionCalibrationPrediction", + ["beta", "gamma", "pred"], + ["new_pred"], + ) + workspace.FeedBlob("beta", beta) + workspace.FeedBlob("gamma", gamma) + workspace.FeedBlob("pred", pred) + workspace.RunOperatorOnce(ref_op) + ref_output = workspace.FetchBlob("new_pred") + + output = torch.ops._caffe2.PriorCorrectionCalibrationPrediction( + torch.from_numpy(beta), + torch.from_numpy(gamma), + torch.from_numpy(pred), + ) + 
torch.testing.assert_allclose(ref_output, output) + + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 60665ace17b918d6a0548ebc42b6c5bca9014b31 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Wed, 23 Sep 2020 18:56:07 -0700 Subject: [PATCH 073/449] [quant] Add optimized approach to calculate qparams for qembedding_bag (#45149) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45149 The choose_qparams_optimized calculates the the optimized qparams. It uses a greedy approach to nudge the min and max and calculate the l2 norm and tries to minimize the quant error by doing `torch.norm(x-fake_quant(x,s,z))` Test Plan: Imported from OSS Reviewed By: raghuramank100 Differential Revision: D23848060 fbshipit-source-id: c6c57c9bb07664c3f1c87dd7664543e09f634aee --- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/native/quantized/QTensor.cpp | 88 +++++++++++++++++++ .../quantized/cpu/qembeddingbag_prepack.cpp | 57 ++++++------ aten/src/ATen/native/quantized/library.cpp | 4 +- test/quantization/test_quantized_op.py | 34 ++++--- tools/autograd/gen_python_functions.py | 1 + .../quantization/insert_quant_dequant.cpp | 5 +- torch/overrides.py | 1 + 8 files changed, 153 insertions(+), 41 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c61f021f8c5f..ae6afc3818a5 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4312,6 +4312,10 @@ use_c10_dispatcher: full variants: function +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (float, float) + use_c10_dispatcher: full + variants: function + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 5efec6420906..9db2a6eb2ac4 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -232,5 +232,93 @@ std::tuple _choose_qparams_per_tensor( return std::make_tuple(q_params.scale, q_params.zero_point); } +float calculate_quant_loss( + const float* input, + int numel, + float xmin, + float xmax, + float* q_input, + int bit_width) { + xmin = static_cast(xmin); + float data_range = xmax - xmin; + float qmax = (1 << bit_width) - 1; + float scale = data_range == 0 + ? 1.0 + : static_cast(static_cast(data_range / qmax)); + float inverse_scale = 1.0f / scale; + + float norm = 0.0f; + constexpr int VLEN = 8; + int i = 0; + +// TODO add FBGEMM kernel +// #ifdef USE_FBGEMM +// #endif + + // remainder loop + for (; i < numel; i++) { + q_input[i] = std::max( + 0.0f, std::min(nearbyint((input[i] - xmin) * inverse_scale), qmax)); + q_input[i] = q_input[i] * scale + xmin; + norm += (input[i] - q_input[i]) * (input[i] - q_input[i]); + } + return std::sqrt(norm); +} + +/* + Helper function to find the best min/max for a tensor to calculate qparams. + It uses a greedy approach to nudge the min and max and calculate the l2 norm + and tries to minimize the quant error by doing `torch.norm(x-fake_quant(x,s,z))` + Returns the optimized xmax and xmin value of the tensor. 
+*/ +std::tuple choose_qparams_optimized( + const at::Tensor& input_tensor, + int64_t numel, + const int64_t n_bins, + const double ratio, + int64_t bit_width) { + + const float* input_row = input_tensor.data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); + float xmax = *std::max_element(input_row, input_row + numel); + + float stepsize = (xmax - xmin) / n_bins; + int min_bins = n_bins * (1.0 - (float) ratio); + const float* input = input_tensor.contiguous().data_ptr(); + std::vector q_input(numel); + + float loss = + calculate_quant_loss(input, numel, xmin, xmax, q_input.data(), bit_width); + float best_loss = loss; + + float cur_min = xmin; + float cur_max = xmax; + float cur_loss = loss; + + float thr = min_bins * stepsize; + while (cur_min + thr < cur_max) { + // move left + float loss1 = calculate_quant_loss( + input, numel, cur_min + stepsize, cur_max, q_input.data(), bit_width); + // move right + float loss2 = calculate_quant_loss( + input, numel, cur_min, cur_max - stepsize, q_input.data(), bit_width); + if (cur_loss < loss1 && cur_loss < loss2 && cur_loss < best_loss) { + // found a local optima + best_loss = cur_loss; + xmin = cur_min; + xmax = cur_max; + } + if (loss1 < loss2) { + cur_min = cur_min + stepsize; + cur_loss = loss1; + } else { + cur_max = cur_max - stepsize; + cur_loss = loss2; + } + } + + return std::make_tuple((float) xmax, (float) xmin); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index dc1f26345e62..6c67b6cc6c86 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -137,7 +137,10 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { return output; } -Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_prepack_helper( + const Tensor& weight, + int bit_width, + bool optimized_qparams) { int64_t embedding_rows = weight.size(0); int64_t embedding_cols = weight.size(1); @@ -145,16 +148,16 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { const auto weight_data = weight.data_ptr(); TORCH_CHECK( - BIT_RATE == 4 || BIT_RATE == 2, - "BIT_RATE must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." - "For 8bit, consider using 'embedding_bag_byte_prepack'."); + bit_width == 4 || bit_width == 2, + "bit_width must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." + "For 8bit, consider using 'embedding_bag_byte_prepack'."); - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / bit_width; TORCH_CHECK( weight_contig.size(weight.dim() - 1) % NUM_ELEM_PER_BYTE == 0, - "qembeddingbag_" + c10::to_string(BIT_RATE) + - "bit_prepack only works for the number of columns a multiple of " - + c10::to_string(NUM_ELEM_PER_BYTE)); + "qembeddingbag_" + c10::to_string(bit_width) + + "bit_prepack only works for the number of columns a multiple of " + + c10::to_string(NUM_ELEM_PER_BYTE)); // The "fused" representation stores the scale and bias with the // row-wise quantized data in one tensor. 
@@ -178,16 +181,20 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; - float Xmin = *std::min_element(input_row, input_row + embedding_cols); - float Xmax = *std::max_element(input_row, input_row + embedding_cols); - + float Xmin, Xmax; + if (optimized_qparams) { + std::tie(Xmax, Xmin) = at::choose_qparams_optimized( + weight_contig[row], embedding_cols, 200, 0.16, bit_width); + } else { + Xmin = *std::min_element(input_row, input_row + embedding_cols); + Xmax = *std::max_element(input_row, input_row + embedding_cols); + } Xmin = static_cast(Xmin); - const float range = Xmax - Xmin; - + float range = Xmax - Xmin; // Set scale to 1.0f for the corner case of Xmax == Xmin . // Any non-zero scale would work because during quantization // (X - Xmin) / scale will be 0 for all X unless scale is 0. - at::Half scale = range == 0 ? 1.0f : range / ((1 << BIT_RATE) - 1); + at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; if (scale == 0 || std::isinf(inverse_scale)) { // Corner case handling when Xmax == Xmin @@ -195,7 +202,6 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { scale = 1.0f; inverse_scale = 1.0f; } - // Update the scale and zero_point of each row. at::Half* output_row_scale_zp = reinterpret_cast( output_row + @@ -209,15 +215,14 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { float X = input_row[col]; std::uint8_t quantized = std::max( 0, - std::min( - lrintf((X - Xmin) * inverse_scale), (1 << BIT_RATE) - 1)); + std::min(lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); // We pack 2 4-bit values in a byte. Index 0 is packed in the lower 4-bits // and index 1 is packed in the upper 4-bits. if (col % NUM_ELEM_PER_BYTE == 0) { output_row[col / NUM_ELEM_PER_BYTE] = quantized; } else { output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); + (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); } } // embedding_cols } // embedding_rows @@ -231,8 +236,9 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { // To later de-quantize values, the scale (range / 15) and zero_point // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. -Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 4 /*BIT_RATE*/); +Tensor qembeddingbag_4bit_prepack(const Tensor& weight, bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 4 /*bit_width*/, optimized_qparams); } // Applies 2-bit row-wise quantization by determining the range @@ -243,8 +249,9 @@ Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. // TODO() - Add 2Bit Embedding Lookup operator. 
-Tensor qembeddingbag_2bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 2 /*BIT_RATE*/); +Tensor qembeddingbag_2bit_prepack(const Tensor& weight, bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 2 /*bit_width*/, optimized_qparams); } class QEmbeddingPackWeights final { @@ -255,9 +262,9 @@ class QEmbeddingPackWeights final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("embedding_bag_byte_prepack", qembeddingbag_byte_prepack); - m.impl("embedding_bag_4bit_prepack", qembeddingbag_4bit_prepack); - m.impl("embedding_bag_2bit_prepack", qembeddingbag_2bit_prepack); + m.impl("embedding_bag_byte_prepack", TORCH_FN(qembeddingbag_byte_prepack)); + m.impl("embedding_bag_4bit_prepack", TORCH_FN(qembeddingbag_4bit_prepack)); + m.impl("embedding_bag_2bit_prepack", TORCH_FN(qembeddingbag_2bit_prepack)); } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index c8e247b42365..6049ccbe1e46 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -107,9 +107,9 @@ TORCH_LIBRARY(quantized, m) { m.def("embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"); m.def("embedding_bag_byte_prepack(Tensor weight) -> Tensor"); m.def("embedding_bag_byte_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_4bit_prepack(Tensor weight) -> Tensor"); + m.def("embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor"); m.def("embedding_bag_4bit_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_2bit_prepack(Tensor weight) -> Tensor"); + m.def("embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor"); m.def("embedding_bag_2bit_unpack(Tensor weight) -> Tensor"); m.def("embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor"); m.def("embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? 
compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"); diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 674ace864343..9412332c238b 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -119,7 +119,6 @@ def _get_random_tensor_and_q_params(shapes, rand_scale, torch_type): X_scale = 1e-10 return X, X_scale, X_zero_point - class TestQuantizedOps(TestCase): """Helper function to test quantized activation functions.""" @@ -2718,11 +2717,14 @@ def test_qlinear_unpack(self, W, use_channelwise): @unittest.skipIf(sys.platform == "darwin", "Known test failure on Mac.") class TestQuantizedEmbeddingOps(TestCase): - def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate): + def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate, optimized_qparams): weights = torch.from_numpy((np.random.random_sample(( num_embeddings, embedding_dim)) + 1).astype(np.float32)) - w_packed = pack_fn(weights) + if bit_rate == 8: + w_packed = pack_fn(weights) + else: + w_packed = pack_fn(weights, optimized_qparams=optimized_qparams) w_unpacked = unpack_fn(w_packed) if bit_rate == 8: @@ -2753,13 +2755,13 @@ def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embe conversion_op = "FloatToFused2BitRowwiseQuantized" reverse_conversion_op = "Fused2BitRowwiseQuantizedToFloat" - def get_c2_weights(weights): + def get_c2_weights(weights, engine_str): workspace.ResetWorkspace() workspace.FeedBlob("weights", weights) workspace.RunOperatorOnce( core.CreateOperator( - conversion_op, ["weights"], ["quantized_weights"] + conversion_op, ["weights"], ["quantized_weights"], engine=engine_str ) ) emb_q = workspace.FetchBlob("quantized_weights") @@ -2776,7 +2778,11 @@ def get_c2_weights(weights): ) return torch.from_numpy(emb_q), dequantized_data - w_packed_c2, w_unpacked_c2 = get_c2_weights(weights) + if optimized_qparams: + engine = "GREEDY" + else: + engine = "" + w_packed_c2, w_unpacked_c2 = get_c2_weights(weights, engine) # Compare packed weights against C2. 
np.testing.assert_allclose(w_packed.numpy(), w_packed_c2.numpy(), atol=1e-6, rtol=1e-6) @@ -2790,25 +2796,27 @@ def test_embedding_bag_byte_unpack(self, num_embeddings, embedding_dim): pack_fn = torch.ops.quantized.embedding_bag_byte_prepack unpack_fn = torch.ops.quantized.embedding_bag_byte_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=8) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 8, False) """ Tests the correctness of the embedding_bag_4bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0),) - def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_4bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_4bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=4) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 4, optimized_qparams) """ Tests the correctness of the embedding_bag_2bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0),) - def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_2bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_2bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=2) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 2, optimized_qparams) def embedding_bag_rowwise_offsets_run( self, bit_rate, num_embeddings, diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b7fa4a3a8308..995dff38030b 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -404,6 +404,7 @@ def get_cpp_formal(arg, ensure_temp_safe=True): 'std::tuple', 'std::tuple', 'std::tuple', + 'std::tuple', 'std::vector', 'Scalar', 'bool', 'int64_t', 'void*', 'void', 'QScheme', 'double', diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp index ce6d5cbc23ff..5c6851ce4fab 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp @@ -291,9 +291,13 @@ Node* insertEmbeddingBagOps(Node* observer, const std::string& op_name) { auto observer_out = observer->output(); std::string prepack_fn, quant_fn; + std::vector prepack_inputs = {observer_out}; if (op_name == "embedding_bag_4bit") { + bool optimized_qparams = false; + Value* optimized_qparams_false = g->insertConstant(optimized_qparams); prepack_fn = "quantized::embedding_bag_4bit_prepack"; quant_fn = "quantized::embedding_bag_4bit_rowwise_offsets"; + prepack_inputs.push_back(optimized_qparams_false); } else if (op_name == "embedding_bag_byte") { prepack_fn = "quantized::embedding_bag_byte_prepack"; quant_fn = 
"quantized::embedding_bag_byte_rowwise_offsets"; @@ -302,7 +306,6 @@ Node* insertEmbeddingBagOps(Node* observer, const std::string& op_name) { "Graph Mode Quantization currently supports 4-bit and 8-bit embedding bag quantization."); } - std::vector prepack_inputs = {observer_out}; std::vector uses = observer_out->uses(); Node* embedding_bag_float_op; // We expect that the output of the weight observer will be consumed by the diff --git a/torch/overrides.py b/torch/overrides.py index b287bf17958a..d5f247e5d51a 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -277,6 +277,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.cholesky: lambda input, upper=False, out=None: -1, torch.cholesky_inverse: lambda input, upper=False, out=None: -1, torch.cholesky_solve: lambda input1, input2, upper=False, out=None: -1, + torch.choose_qparams_optimized: lambda input, numel, n_bins, ratio, bit_width: -1, torch.chunk: lambda input, chunks, dim=0: -1, torch.clamp: lambda input, min=None, max=None, out=None: -1, torch.clip: lambda input, min=None, max=None, out=None: -1, From c760bc8fb15a13a66d514b4107ff6ea5d1720e6c Mon Sep 17 00:00:00 2001 From: Jordan Fix Date: Wed, 23 Sep 2020 20:47:51 -0700 Subject: [PATCH 074/449] Add GlowLoadAOTModel flag (#45189) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45189 Pull Request resolved: https://github.com/pytorch/glow/pull/4902 Test Plan: Test locally Reviewed By: yinghai Differential Revision: D23810445 fbshipit-source-id: 56e717d80abbfe76b15d0f4249e1e399a9722753 --- caffe2/opt/onnxifi_op.h | 5 ++++- third_party/foxi | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index 6a211a604d52..f19403a14e58 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -263,10 +263,13 @@ class OnnxifiOp final : public Operator { defered_blob_reader = ws->GetBlob("__DEFERRED_BLOB_READER__"); } onnxGraph graph{nullptr}; + + static const uint64_t auxPropertiesListAOT[] = { + ONNXIFI_OPTIMIZATION_AOT, ONNXIFI_GRAPH_PROPERTY_NONE}; CAFFE_ENFORCE_EQ( lib_->onnxInitGraph( backend, - nullptr, + use_glow_aot_ ? 
auxPropertiesListAOT : nullptr, onnx_model_str.size(), (const void*)(onnx_model_str.c_str()), weight_descs.size(), diff --git a/third_party/foxi b/third_party/foxi index 9ca418d2f4bc..4aba696ec8f3 160000 --- a/third_party/foxi +++ b/third_party/foxi @@ -1 +1 @@ -Subproject commit 9ca418d2f4bc8e022d843388afa0fd0a14bd57dc +Subproject commit 4aba696ec8f31794fd42880346dc586486205e0a From 2d00ebd29f2363a2f51e88e4c898244679d114f4 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Wed, 23 Sep 2020 21:13:48 -0700 Subject: [PATCH 075/449] Failing test demonstrating problems with mixed output shapes (#44455) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44455 Test Plan: Imported from OSS Reviewed By: gmagogsfm Differential Revision: D23886119 Pulled By: bertmaher fbshipit-source-id: 41787930f154cf4e8a1766613c4cf33b18246555 --- test/test_jit_fuser_te.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index f9aca9a5dea1..dc7e67a14ee2 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1311,6 +1311,36 @@ def fn(x): self.assertEqual(ref, t(x)) self.assertEqual(len(self.findFusionGroups(t.graph_for(x))), 0) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_superslomo(self): + # Test extracted from Super-SloMo: https://github.com/avinashpaliwal/Super-SloMo + # A few interesting things happen here: strided inputs of mixed size, + # plus outputs of mixed shapes. The latter characteristic happened to + # expose a memory corruption bug due to not properly guarding the + # outputs. + def eager(t0, t1, t2, t3, t4): + t5 = torch.mul(t0, t4) + t6 = torch.mul(t2, t3) + t7 = torch.mul(t6, t1) + t9 = torch.add(t5, t7) + t11 = torch.add(t0, t6) + ft_p = torch.div(t9, t11) + return (ft_p, t11, t9, t6) + + t0 = torch.rand(1, 6, 352, 352, device="cuda").transpose(0, 1) + t1 = torch.rand(6, 3, 352, 352, device="cuda") + t2 = torch.rand(6, device="cuda")[None, None, None, :].permute(3, 0, 1, 2) + t3 = torch.rand(6, 1, 352, 352, device="cuda") + t4 = torch.rand(6, 3, 352, 352, device="cuda") + inputs = [t0, t1, t2, t3, t4] + + script = torch.jit.script(eager) + for _ in range(4): + for pair in zip(script(*inputs), eager(*inputs)): + test, ref = pair + torch.testing.assert_allclose(test, ref) + self.assertAllFused(script.graph_for(*inputs)) + if __name__ == '__main__': run_tests() From 956a25d0614dc783931e73e57ac993a885fac3ed Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 23 Sep 2020 21:16:52 -0700 Subject: [PATCH 076/449] Revert D23858329: [PT Model Split] Support 2 operators in PT by C2 conversion Test Plan: revert-hammer Differential Revision: D23858329 (https://github.com/pytorch/pytorch/commit/721cfbf8425cf2c1dc5e27d1332e32e1a42ef541) Original commit changeset: ed37118ca7f0 fbshipit-source-id: 30c700f80665be11afc608b00a77766064e60b35 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 --- caffe2/operators/gather_ranges_to_dense_op.h | 3 - .../operator_test/torch_integration_test.py | 69 +------------------ 3 files changed, 1 insertion(+), 79 deletions(-) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index aa31ef12b36a..10396aafc97e 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,11 +104,3 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 - -using GatherRangesToDenseCPUOp = - caffe2::GatherRangesToDenseOp; - 
-C10_EXPORT_CAFFE2_OP_TO_C10_CPU( - GatherRangesToDense, - "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", - GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index 217a61b25129..c1dd5a527005 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,7 +5,6 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" -#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -16,8 +15,6 @@ #include #include -C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); - namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 7194daa91203..55f26a89987f 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -7,12 +7,10 @@ import torch import unittest -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace from hypothesis import given, settings from scipy.stats import norm -dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/operators:calibration_op') - def generate_rois(roi_counts, im_dims): assert len(roi_counts) == len(im_dims) @@ -877,71 +875,6 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) - def test_gather_ranges_to_dense_op(self): - data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) - ranges = np.array([[[2, 4]], [[0, 0]]]) - key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) - lengths = np.array([4]) - min_observation = 2 - max_mismatched_ratio = 0.5 - max_empty_ratio = 1.0 - - outputs_name = ["X_{}".format(i) for i in range(len(lengths))] - ref_op = core.CreateOperator( - "GatherRangesToDense", - ["data", "ranges", "key"], - outputs_name, - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - workspace.FeedBlob("data", data) - workspace.FeedBlob("ranges", ranges) - workspace.FeedBlob("key", key) - workspace.RunOperatorOnce(ref_op) - ref_outputs = [] - for output_name in outputs_name: - ref_outputs.append(workspace.FetchBlob(output_name)) - - outputs = torch.ops._caffe2.GatherRangesToDense( - torch.from_numpy(data), - torch.from_numpy(ranges), - torch.from_numpy(key), - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - - self.assertEqual(len(ref_outputs), len(outputs)) - for i in range(0, len(ref_outputs)): - np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) - - def test_prior_correct_calibration_prediction_op(self): - beta = np.array([1.0, 2.0], dtype=np.float32) - gamma = np.array([3.0, 4.0], dtype=np.float32) - pred = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32) - - ref_op = core.CreateOperator( - "PriorCorrectionCalibrationPrediction", - ["beta", "gamma", "pred"], - ["new_pred"], - ) - workspace.FeedBlob("beta", beta) - workspace.FeedBlob("gamma", gamma) - workspace.FeedBlob("pred", pred) - workspace.RunOperatorOnce(ref_op) - ref_output = workspace.FetchBlob("new_pred") - - output = 
torch.ops._caffe2.PriorCorrectionCalibrationPrediction( - torch.from_numpy(beta), - torch.from_numpy(gamma), - torch.from_numpy(pred), - ) - torch.testing.assert_allclose(ref_output, output) - - @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 070fe15e4cc4b684205b869fb8ba4625c3311d8f Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 23 Sep 2020 22:01:14 -0700 Subject: [PATCH 077/449] Add link to profiling recipe from rpc main docs (#45235) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45235 This is so that users know that the profiler works as expected with RPC and they can learn how to use it to profile RPC-based workloads. ghstack-source-id: 112773748 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23777888 fbshipit-source-id: 4805be9b949c8c7929182f291a6524c3c6a725c1 --- docs/source/rpc.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 37adc14faae1..1e4788c99634 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -293,8 +293,11 @@ The RRef design note covers the design of the :ref:`rref` (Remote REFerence) pro Tutorials --------- -The RPC tutorial introduces users to the RPC framework and provides two example applications using :ref:`torch.distributed.rpc` APIs. +The RPC tutorials introduce users to the RPC framework, provide several example applications +using :ref:`torch.distributed.rpc` APIs, and demonstrate how +to use `the profiler `__ to profile RPC-based workloads. - `Getting started with Distributed RPC Framework `__ - `Implementing a Parameter Server using Distributed RPC Framework `__ - `Combining Distributed DataParallel with Distributed RPC Framework `__ +- `Profiling RPC-based Workloads `__ From 6a2e9eb51c453589cfec7cbf79f429fdf46f1fd4 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 23 Sep 2020 22:07:54 -0700 Subject: [PATCH 078/449] torch.fft: Multi-dimensional transforms (#44550) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44550 Part of the `torch.fft` work (gh-42175). This adds n-dimensional transforms: `fftn`, `ifftn`, `rfftn` and `irfftn`. This is aiming for correctness first, with the implementation on top of the existing `_fft_with_size` restrictions. I plan to follow up later with a more efficient rewrite that makes `_fft_with_size` work with arbitrary numbers of dimensions. 
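For illustration only (not part of this patch): a minimal sketch of how the n-dimensional interface added here is meant to be called, based on the signatures and docstrings in this diff. The shapes and variable names are made up for the example.

```python
import torch
import torch.fft

x = torch.randn(4, 5, 6, dtype=torch.complex128)

# Transform only the last two dimensions, zero-padding each to 8 points.
X = torch.fft.fftn(x, s=(8, 8), dim=(-2, -1), norm="ortho")

# Calling ifftn with the same s/dim/norm recovers the (zero-padded) input.
x_pad = torch.fft.ifftn(X, s=(8, 8), dim=(-2, -1), norm="ortho")

# For real input, rfftn keeps only the non-negative frequencies of the last
# transformed dimension; pass s to irfftn so odd lengths round-trip exactly.
r = torch.randn(10, 9)
R = torch.fft.rfftn(r)
r_back = torch.fft.irfftn(R, s=r.shape)
```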
Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D23846032 Pulled By: mruberry fbshipit-source-id: e6950aa8be438ec5cb95fb10bd7b8bc9ffb7d824 --- aten/src/ATen/WrapDimUtils.h | 14 +- aten/src/ATen/native/SpectralOps.cpp | 168 +++++++++++++++ aten/src/ATen/native/native_functions.yaml | 20 ++ docs/source/fft.rst | 4 + test/test_spectral_ops.py | 178 +++++++++++++++- torch/csrc/api/include/torch/fft.h | 60 ++++++ torch/fft/__init__.py | 225 +++++++++++++++++++++ 7 files changed, 662 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index c248ea461116..2768efe6e683 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -30,14 +30,15 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector& dims, int64_t dim_post_expr) { +// wrap each dim in the dims array, taking dim_post_expr as the true number of dimensions +static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_post_expr) { if (dim_post_expr <= 0) { dim_post_expr = 1; // this will make range [-1, 0] } int64_t min = -dim_post_expr; int64_t max = dim_post_expr - 1; - for (auto& dim : dims) { + for (int64_t i = 0; i < ndims; ++i) { + auto &dim = dims[i]; if (dim < min || dim > max) { TORCH_CHECK_INDEX(false, "Dimension out of range (expected to be in range of [", @@ -47,6 +48,13 @@ static inline void maybe_wrap_dims(std::vector& dims, int64_t dim_post_ } } +// Wrap each dim in a contiguous container, taking dim_post_expr as the true number of dimensions +// E.g. could also be std::array or c10::SmallVector +template +inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) { + return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr); +} + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors // to be "skipped" (both for wrap dimension behavior and dimension size checking). diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index e467c21a4a30..120ef9f73042 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -203,6 +203,116 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, return out; } +// Dimensions to transform, and the signal shape in those dimensions +struct ShapeAndDims { + DimVector shape, dim; +}; + +// Pre-process n-dimensional fft's `s` and `dim` arguments. +// Wraps dimensions and applies defaulting behavior. +// Also checks transform dims are unique and transform shape is non-empty. 
+ShapeAndDims canonicalize_fft_shape_and_dim_args( + Tensor input, c10::optional shape, c10::optional dim) { + const int64_t input_dim = input.dim(); + const IntArrayRef input_sizes = input.sizes(); + ShapeAndDims ret; + + if (dim) { + ret.dim.resize(dim->size()); + std::copy(dim->begin(), dim->end(), ret.dim.begin()); + maybe_wrap_dims(ret.dim, input_dim); + + // Check dims are unique + DimVector copy = ret.dim; + std::sort(copy.begin(), copy.end()); + auto duplicate = std::adjacent_find(copy.begin(), copy.end()); + TORCH_CHECK(duplicate == copy.end(), "FFT dims must be unique"); + } + + if (shape) { + // Has shape, may have dim + TORCH_CHECK(!dim || dim->size() == shape->size(), + "When given, dim and shape arguments must have the same length"); + TORCH_CHECK(shape->size() <= input_dim, + "Got shape with ", shape->size(), " values but input tensor " + "only has ", input_dim, " dimensions."); + const int64_t transform_ndim = shape->size(); + // If shape is given, dims defaults to the last shape.size() dimensions + if (!dim) { + ret.dim.resize(transform_ndim); + std::iota(ret.dim.begin(), ret.dim.end(), input_dim - transform_ndim); + } + + // Translate shape of -1 to the default length + ret.shape.resize(transform_ndim); + for (int64_t i = 0; i < transform_ndim; ++i) { + const auto n = (*shape)[i]; + ret.shape[i] = n == -1 ? input_sizes[ret.dim[i]] : n; + } + } else if (!dim) { + // No shape, no dim + ret.dim.resize(input_dim); + std::iota(ret.dim.begin(), ret.dim.end(), int64_t{0}); + ret.shape.resize(input_dim); + std::copy(input_sizes.begin(), input_sizes.end(), ret.shape.begin()); + } else { + // No shape, has dim + ret.shape.resize(ret.dim.size()); + for (int64_t i = 0; i < ret.dim.size(); ++i) { + ret.shape[i] = input_sizes[ret.dim[i]]; + } + } + + for (int64_t i = 0; i < ret.shape.size(); ++i) { + TORCH_CHECK(ret.shape[i] > 0, + "Invalid number of data points (", ret.shape[i], ") specified"); + } + + return ret; +} + +// Complex to complex n-dimensional fft +Tensor fftn_c2c( + const Tensor& input, IntArrayRef shape, IntArrayRef dim, + c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + const auto input_dim = input.dim(); + + Tensor x = resize_fft_input(input, dim, shape); + x = at::view_as_real(x); + + const int64_t transform_ndim = dim.size(); + const auto norm = norm_from_string(norm_str, forward); + // _fft_with_size only supports 3 dimensions being transformed at a time. + // This limit is inherited from cuFFT. + constexpr int64_t max_signal_ndim = 3; + + // Transform n dimensions, up to 3 at a time + // TODO: rewrite _fft_with_size to transform more than 3 dimensions at once. 
+ for (int64_t i = 0; i < transform_ndim; i += max_signal_ndim) { + const int64_t signal_ndim = std::min(transform_ndim - i, max_signal_ndim); + DimVector source_dim(signal_ndim); + DimVector dest_dim(signal_ndim); + + for (int64_t j = 0; j < signal_ndim; ++j) { + source_dim[j] = dim[i + j]; + dest_dim[j] = j + (input_dim - signal_ndim); + } + + // _fft operates on up-to the last 3 dims, so move selected dims to the end + x = at::movedim(x, source_dim, dest_dim); + + x = _fft(x, signal_ndim, /*complex_input=*/true, /*complex_output=*/true, + /*inverse=*/!forward, /*signal_sizes=*/{}, /*normalization=*/norm, + /*onesided=*/false); + + // Move transform dims back to their original order + x = at::movedim(x, dest_dim, source_dim); + } + + return at::view_as_complex(x); +} + } // torch.fft.fft, analogous to NumPy's numpy.fft.fft @@ -240,6 +350,64 @@ Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); } +Tensor fft_fftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_ifftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/false); +} + +Tensor fft_rfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = desc.shape.back(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // rfft on last dim to get hermitian complex shape + auto x = native::fft_rfft(self, last_shape, last_dim, norm); + // Normal fft on remaining dims + return fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_irfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = [&]() -> c10::optional { + // If shape is defaulted in the last dimension, + // pass nullopt to irfft and let it calculate the default size + if (!s.has_value() || (s->back() == -1)) { + return c10::nullopt; + } + return desc.shape.back(); + }(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // Normal ifft for all but last dim + Tensor x = promote_tensor_fft(self, /*require_complex=*/true); + x = fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/false); + // Then 1d irfft on last dim to get real output + return native::fft_irfft(x, last_shape, last_dim, norm); +} // This is a pass-through wrapper function that does the size check and // inferences. 
The actual forward implementation function is called diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ae6afc3818a5..ae3579cd0aa9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7943,6 +7943,26 @@ use_c10_dispatcher: full variants: function +- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + - func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/docs/source/fft.rst b/docs/source/fft.rst index 8ec06a3574d2..a732f3e5c652 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -19,7 +19,11 @@ Functions .. autofunction:: fft .. autofunction:: ifft +.. autofunction:: fftn +.. autofunction:: ifftn .. autofunction:: rfft .. autofunction:: irfft +.. autofunction:: rfftn +.. autofunction:: irfftn .. autofunction:: hfft .. autofunction:: ihfft diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 59b58fa202d6..d7ef731699b3 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -3,6 +3,7 @@ import math from contextlib import contextmanager from itertools import product +import itertools from torch.testing._internal.common_utils import \ (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA) @@ -11,7 +12,7 @@ skipCPUIfNoMkl, skipCUDAIfRocm, deviceCountAtLeast, onlyCUDA) from distutils.version import LooseVersion -from typing import Optional +from typing import Optional, List if TEST_NUMPY: @@ -115,6 +116,7 @@ def method_fn(t): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) @@ -226,11 +228,13 @@ def test_fft_round_trip(self, device, dtype): def test_empty_fft(self, device, dtype): t = torch.empty(0, device=device, dtype=dtype) match = r"Invalid number of data points \([-\d]*\) specified" - fft_functions = [torch.fft.fft, torch.fft.ifft, torch.fft.hfft, - torch.fft.irfft] + fft_functions = [torch.fft.fft, torch.fft.fftn, + torch.fft.ifft, torch.fft.ifftn, + torch.fft.irfft, torch.fft.irfftn, + torch.fft.hfft] # Real-only functions if not dtype.is_complex: - fft_functions += [torch.fft.rfft, torch.fft.ihfft] + fft_functions += [torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, match): @@ -242,6 +246,9 @@ def test_fft_invalid_dtypes(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.rfft(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + torch.fft.rfftn(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.ihfft(t) @@ -292,7 +299,9 @@ def test_fft_half_errors(self, device, dtype): # TODO: Remove torch.half error 
when complex32 is fully implemented x = torch.randn(64, device=device).to(dtype) fft_functions = (torch.fft.fft, torch.fft.ifft, + torch.fft.fftn, torch.fft.ifftn, torch.fft.rfft, torch.fft.irfft, + torch.fft.rfftn, torch.fft.irfftn, torch.fft.hfft, torch.fft.ihfft) for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, "Unsupported dtype "): @@ -300,6 +309,7 @@ def test_fft_half_errors(self, device, dtype): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @dtypes(torch.double, torch.complex128) # gradcheck requires double def test_fft_backward(self, device, dtype): test_args = list(product( @@ -340,6 +350,166 @@ def test_fn(x): self.assertTrue(torch.autograd.gradcheck(test_fn, (input,))) + # nd-fft tests + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_numpy(self, device, dtype): + norm_modes = ((None, "forward", "backward", "ortho") + if LooseVersion(np.__version__) >= '1.20.0' + else (None, "ortho")) + + # input_ndim, s, dim + transform_desc = [ + *product(range(2, 5), (None,), (None, (0,), (0, -1))), + *product(range(2, 5), (None, (4, 10)), (None,)), + (6, None, None), + (5, None, (1, 3, 4)), + (3, None, (0, -1)), + (3, None, (1,)), + (1, None, (0,)), + (4, (10, 10), None), + (4, (10, 10), (0, 1)) + ] + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + def fn(t: torch.Tensor, s: Optional[List[int]], dim: Optional[List[int]], norm: Optional[str]): + return torch_fn(t, s, dim, norm) + + torch_fns = (torch_fn, torch.jit.script(fn)) + + expected = numpy_fn(input.cpu().numpy(), s, dim, norm) + exact_dtype = dtype in (torch.double, torch.complex128) + for fn in torch_fns: + actual = fn(input, s, dim, norm) + self.assertEqual(actual, expected, exact_dtype=exact_dtype) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_round_trip(self, device, dtype): + norm_modes = (None, "forward", "backward", "ortho") + + # input_ndim, dim + transform_desc = [ + *product(range(2, 5), (None, (0,), (0, -1))), + *product(range(2, 5), (None,)), + (7, None), + (5, (1, 3, 4)), + (3, (0, -1)), + (3, (1,)), + (1, 0), + ] + + fft_functions = [(torch.fft.fftn, torch.fft.ifftn)] + + # Real-only functions + if not dtype.is_complex: + fft_functions += [(torch.fft.rfftn, torch.fft.irfftn)] + + for input_ndim, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + x = torch.randn(*shape, device=device, dtype=dtype) + + for (forward, backward), norm in product(fft_functions, norm_modes): + if isinstance(dim, tuple): + s = [x.size(d) for d in dim] + else: + s = x.size() if dim is None else x.size(dim) + + kwargs = {'s': s, 'dim': dim, 'norm': norm} + y = backward(forward(x, **kwargs), **kwargs) + # For real input, ifftn(fftn(x)) will convert to complex + self.assertEqual(x, y, exact_dtype=( + forward != torch.fft.fftn or x.is_complex())) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + 
@onlyOnCPUAndCUDA + @dtypes(torch.double, torch.complex128) # gradcheck requires double + def test_fftn_backward(self, device, dtype): + # input_ndim, s, dim + transform_desc = [ + *product((2, 3), (None,), (None, (0,), (0, -1))), + *product((2, 3), (None, (4, 10)), (None,)), + (4, None, None), + (3, (10, 10), (0, 1)), + (2, (1, 1), (0, 1)), + (2, None, (1,)), + (1, None, (0,)), + (1, (11,), (0,)), + ] + norm_modes = (None, "forward", "backward", "ortho") + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + + # Workaround for gradcheck's poor support for complex input + # Use real input instead and put view_as_complex into the graph + if dtype.is_complex: + def test_fn(x): + return torch_fn(torch.view_as_complex(x), s, dim, norm) + inputs = (torch.view_as_real(input).detach().requires_grad_(),) + else: + def test_fn(x): + return torch_fn(x, s, dim, norm) + inputs = (input.detach().requires_grad_(),) + + self.assertTrue(torch.autograd.gradcheck(test_fn, inputs)) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + def test_fftn_invalid(self, device): + a = torch.rand(10, 10, 10, device=device) + fft_funcs = (torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfftn, torch.fft.irfftn) + + for func in fft_funcs: + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(0, 1, 0)) + + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(2, -1)) + + with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): + func(a, s=(1,), dim=(0, 1)) + + with self.assertRaisesRegex(IndexError, "Dimension out of range"): + func(a, dim=(3,)) + + with self.assertRaisesRegex(RuntimeError, "tensor only has 3 dimensions"): + func(a, s=(10, 10, 10, 10)) + + c = torch.complex(a, a) + with self.assertRaisesRegex(RuntimeError, "Expected a real input"): + torch.fft.rfftn(c) + # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 9622f668214f..1c119ed75226 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -35,6 +35,36 @@ inline Tensor ifft(const Tensor& self, return torch::fft_ifft(self, n, dim, norm); } +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::fftn(t); +/// ``` +inline Tensor fftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_fftn(self, s, dim, norm); +} + +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftn. 
+/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::ifftn(t); +/// ``` +inline Tensor ifftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_ifftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of real input with onesided Hermitian output. /// See https://pytorch.org/docs/master/fft.html#torch.fft.rfft. /// @@ -69,6 +99,36 @@ inline Tensor irfft(const Tensor& self, return torch::fft_irfft(self, n, dim, norm); } +/// Computes the N dimensional FFT of real input with onesided Hermitian output. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftn +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kDouble); +/// torch::fft::rfftn(t); +/// ``` +inline Tensor rfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_rfftn(self, s, dim, norm); +} + +/// Computes the inverse of torch.fft.rfftn. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.irfftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::irfftn(t); +/// ``` +inline Tensor irfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_irfftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of a onesided Hermitian signal /// /// The input represents a Hermitian symmetric time domain signal. The returned diff --git a/torch/fft/__init__.py b/torch/fft/__init__.py index 08b5d28b05ae..3e4bcc35464b 100644 --- a/torch/fft/__init__.py +++ b/torch/fft/__init__.py @@ -87,6 +87,101 @@ tensor([0.+0.j, 1.+0.j, 2.+0.j, 3.+0.j]) """) +fftn = _add_docstr(_fft.fft_fftn, r""" +fftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the N dimensional discrete Fourier transform of :attr:`input`. + +Note: + + The Fourier domain representation of any real signal satisfies the + Hermitian property: ``X[i_1, ..., i_n] = conj(X[-i_1, ..., -i_n])``. This + function always returns all positive and negative frequency terms even + though, for real inputs, half of these values are redundant. + :func:`~torch.fft.rfftn` returns the more compact one-sided representation + where only the positive frequencies of the last dimension are returned. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the FFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Default: ``s = [input.size(d) for d in dim]`` + dim (Tuple[int], optional): Dimensions to be transformed. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. For the forward transform + (:func:`~torch.fft.fftn`), these correspond to: + + * ``"forward"`` - normalize by ``1/n`` + * ``"backward"`` - no normalization + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the FFT orthonormal) + + Where ``n = prod(s)`` is the logical FFT size. + Calling the backward transform (:func:`~torch.fft.ifftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` + between the two transforms. This is required to make + :func:`~torch.fft.ifftn` the exact inverse. 
+ + Default is ``"backward"`` (no normalization). + +Example: + + >>> import torch.fft + >>> x = torch.rand(10, 10, dtype=torch.complex64) + >>> fftn = torch.fft.fftn(t) + + The discrete Fourier transform is separable, so :func:`~torch.fft.fftn` + here is equivalent to two one-dimensional :func:`~torch.fft.fft` calls: + + >>> two_ffts = torch.fft.fft(torch.fft.fft(x, dim=0), dim=1) + >>> torch.allclose(fftn, two_ffts) + +""") + +ifftn = _add_docstr(_fft.fft_ifftn, r""" +ifftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the N dimensional inverse discrete Fourier transform of :attr:`input`. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the IFFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Default: ``s = [input.size(d) for d in dim]`` + dim (Tuple[int], optional): Dimensions to be transformed. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. For the backward transform + (:func:`~torch.fft.ifftn`), these correspond to: + + * ``"forward"`` - no normalization + * ``"backward"`` - normalize by ``1/n`` + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the IFFT orthonormal) + + Where ``n = prod(s)`` is the logical IFFT size. + Calling the forward transform (:func:`~torch.fft.fftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` between + the two transforms. This is required to make :func:`~torch.fft.ifftn` + the exact inverse. + + Default is ``"backward"`` (normalize by ``1/n``). + +Example: + + >>> import torch.fft + >>> x = torch.rand(10, 10, dtype=torch.complex64) + >>> ifftn = torch.fft.ifftn(t) + + The discrete Fourier transform is separable, so :func:`~torch.fft.ifftn` + here is equivalent to two one-dimensional :func:`~torch.fft.ifft` calls: + + >>> two_iffts = torch.fft.ifft(torch.fft.ifft(x, dim=0), dim=1) + >>> torch.allclose(ifftn, two_iffts) + +""") + rfft = _add_docstr(_fft.fft_rfft, r""" rfft(input, n=None, dim=-1, norm=None) -> Tensor @@ -199,6 +294,136 @@ tensor([0.0000, 1.0000, 2.0000, 3.0000, 4.0000]) """) +rfftn = _add_docstr(_fft.fft_rfftn, r""" +rfftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the N-dimensional discrete Fourier transform of real :attr:`input`. + +The FFT of a real signal is Hermitian-symmetric, +``X[i_1, ..., i_n] = conj(X[-i_1, ..., -i_n])`` so the full +:func:`~torch.fft.fftn` output contains redundant information. +:func:`~torch.fft.rfftn` instead omits the negative frequencies in the +last dimension. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the real FFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Default: ``s = [input.size(d) for d in dim]`` + dim (Tuple[int], optional): Dimensions to be transformed. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. 
For the forward transform + (:func:`~torch.fft.rfftn`), these correspond to: + + * ``"forward"`` - normalize by ``1/n`` + * ``"backward"`` - no normalization + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real FFT orthonormal) + + Where ``n = prod(s)`` is the logical FFT size. + Calling the backward transform (:func:`~torch.fft.irfftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` between + the two transforms. This is required to make :func:`~torch.fft.irfftn` + the exact inverse. + + Default is ``"backward"`` (no normalization). + +Example: + + >>> import torch.fft + >>> t = torch.rand(10, 10) + >>> rfftn = torch.fft.rfftn(t) + >>> rfftn.size() + torch.Size([10, 6]) + + Compared against the full output from :func:`~torch.fft.fftn`, we have all + elements up to the Nyquist frequency. + + >>> fftn = torch.fft.fftn(t) + >>> torch.allclose(fftn[..., :6], rfftn) + True + + The discrete Fourier transform is separable, so :func:`~torch.fft.rfftn` + here is equivalent to a combination of :func:`~torch.fft.fft` and + :func:`~torch.fft.rfft`: + + >>> two_ffts = torch.fft.fft(torch.fft.rfft(x, dim=1), dim=0) + >>> torch.allclose(rfftn, two_ffts) + +""") + +irfftn = _add_docstr(_fft.fft_irfftn, r""" +irfftn(input, s=None, dim=None, norm=None) -> Tensor + +Computes the inverse of :func:`~torch.fft.rfftn`. + +:attr:`input` is interpreted as a one-sided Hermitian signal in the Fourier +domain, as produced by :func:`~torch.fft.rfftn`. By the Hermitian property, the +output will be real-valued. + +Note: + Some input frequencies must be real-valued to satisfy the Hermitian + property. In these cases the imaginary component will be ignored. + For example, any imaginary component in the zero-frequency term cannot + be represented in a real output and so will always be ignored. + +Note: + The correct interpretation of the Hermitian input depends on the length of + the original data, as given by :attr:`s`. This is because each input shape + could correspond to either an odd or even length signal. By default, the + signal is assumed to be even length and odd signals will not round-trip + properly. So, it is recommended to always pass the signal shape :attr:`s`. + +Args: + input (Tensor): the input tensor + s (Tuple[int], optional): Signal size in the transformed dimensions. + If given, each dimension ``dim[i]`` will either be zero-padded or + trimmed to the length ``s[i]`` before computing the real FFT. + If a length ``-1`` is specified, no padding is done in that dimension. + Defaults to even output in the last dimension: + ``s[-1] = 2*(input.size(dim[-1]) - 1)``. + dim (Tuple[int], optional): Dimensions to be transformed. + The last dimension must be the half-Hermitian compressed dimension. + Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given. + norm (str, optional): Normalization mode. For the backward transform + (:func:`~torch.fft.irfftn`), these correspond to: + + * ``"forward"`` - no normalization + * ``"backward"`` - normalize by ``1/n`` + * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real IFFT orthonormal) + + Where ``n = prod(s)`` is the logical IFFT size. + Calling the forward transform (:func:`~torch.fft.rfftn`) with the same + normalization mode will apply an overall normalization of ``1/n`` between + the two transforms. This is required to make :func:`~torch.fft.irfftn` + the exact inverse. + + Default is ``"backward"`` (normalize by ``1/n``). 
+ +Example: + + >>> import torch.fft + >>> t = torch.rand(10, 9) + >>> T = torch.fft.rfftn(t) + + Without specifying the output length to :func:`~torch.fft.irfft`, the output + will not round-trip properly because the input is odd-length in the last + dimension: + + >>> torch.fft.irfftn(T).size() + torch.Size([10, 10]) + + So, it is recommended to always pass the signal shape :attr:`s`. + + >>> roundtrip = torch.fft.irfftn(T, t.size()) + >>> roundtrip.size() + torch.Size([10, 9]) + >>> torch.allclose(roundtrip, t) + True + +""") + hfft = _add_docstr(_fft.fft_hfft, r""" hfft(input, n=None, dim=-1, norm=None) -> Tensor From 0b6b7358633dfaa84881ae00608b621e1e35c6fc Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 23 Sep 2020 22:21:23 -0700 Subject: [PATCH 079/449] [fix] type promotion atan2 (#43466) Summary: Fixes https://github.com/pytorch/pytorch/issues/43360 Pull Request resolved: https://github.com/pytorch/pytorch/pull/43466 Reviewed By: malfet Differential Revision: D23834928 Pulled By: mruberry fbshipit-source-id: 2e7e0b4fcf1a846efc171c275d65a6daffd3c631 --- aten/src/ATen/native/BinaryOps.cpp | 8 +++++--- test/test_torch.py | 21 ++++++++++++++++++- test/test_type_promotion.py | 33 +++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index cab77c25b885..f8af756773c9 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -390,14 +390,16 @@ Tensor rsub(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& atan2_out(Tensor& result, const Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(result, self, other); + auto iter = TensorIterator::binary_float_op(result, self, other); atan2_stub(iter.device_type(), iter); return result; } Tensor atan2(const Tensor& self, const Tensor& other) { - Tensor result = at::empty({0}, self.options()); - return native::atan2_out(result, self, other); + Tensor result; + auto iter = TensorIterator::binary_float_op(result, self, other); + atan2_stub(iter.device_type(), iter); + return iter.output(); } Tensor& atan2_(Tensor& self, const Tensor& other) { diff --git a/test/test_torch.py b/test/test_torch.py index 4b08697a908c..70556dd2d2aa 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -19672,10 +19672,20 @@ def test_movedim_view(self, device): torch.int8, torch.short, torch.int, torch.long ] +_integer_types = [ + torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64 +] + _cpu_types: List[torch.dtype] = [] _unsigned_types = [torch.uint8] +# Binary Float Ops +# Operators which use TensorIterator::binary_float_op +# These Ops promote integer inputs to Float. +binary_float_ops_inplace = ['atan2_', 'div_'] + # Helper values and functions for producing tensors and scalars to use in tensor op tests. # Tensor dimension sizes (Small, Medium, Large, Giant) _S = 5 @@ -19896,7 +19906,7 @@ def inner(self, device, dtype): lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, [_wrap_maybe_warns("This overload of addr_? 
is deprecated")]), - ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _float_types), + ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _types, _types_no_half), ('angle', '', _small_3d, lambda t, d: [], 0, 0, 0, _types_no_half, [torch.bfloat16], False), ('fmod', 'value', _small_3d, lambda t, d: [3], 1e-3), ('fmod', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-3), @@ -20188,6 +20198,15 @@ def fn(self, device, dtype) -> None: (isinstance(arg, torch.Tensor) and arg.dtype == torch.float) else arg for arg in device_args] + # Special case for binary float ops (binary ops that promote int to float) + if op_str in binary_float_ops_inplace and \ + 'inplace' in subtest_str and dtype in _integer_types: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + device_result = getattr(device_tensor, op_str)(*device_args) + return # Nothing more to check + # Runs the tensor op on CPU and device cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) device_result = getattr(device_tensor, op_str)(*device_args) diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 9ee90c7cbcd8..7f10915a5ac4 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -7,7 +7,7 @@ from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, TEST_NUMPY, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyOnCPUAndCUDA, - dtypes, onlyCPU) + dtypes, dtypesIfCUDA, onlyCPU) if TEST_NUMPY: import numpy as np @@ -958,6 +958,37 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) + @dtypesIfCUDA(*itertools.product(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False), + torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*itertools.product(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False), + torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False))) + def test_atan2_type_promotion(self, device, dtypes): + dtype1, dtype2 = dtypes + default_float = torch.get_default_dtype() + + def is_int(dtype): + return dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + + def is_float(dtype): + return dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False) + + def get_binary_float_result_type(x, y): + dtype1 = x.dtype + dtype2 = y.dtype + if is_float(dtype1) and is_float(dtype2): + return torch.result_type(x, y) + elif is_float(dtype1) and is_int(dtype2): + return dtype1 + elif is_int(dtype1) and is_float(dtype2): + return dtype2 + elif is_int(dtype1) and is_int(dtype2): + return default_float + + x = torch.tensor(1, dtype=dtype1, device=device) + y = torch.tensor(2, dtype=dtype2, device=device) + self.assertEqual(get_binary_float_result_type(x, y), torch.atan2(x, y).dtype) instantiate_device_type_tests(TestTypePromotion, globals()) From b470fa450038a5108f55894870373c763ff4c431 Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Wed, 23 Sep 2020 23:01:01 -0700 Subject: [PATCH 080/449] Add complex number support for binary logical operators (#43174) Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/43174 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D23684425 Pulled By: mruberry fbshipit-source-id: 4857b16e18ec4c65327136badd7f04c74e32d330 --- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 12 ++++++------ aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu | 9 ++++++--- c10/util/complex.h | 5 +++++ test/test_torch.py | 11 ----------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 09847a010ee3..67a961401fb0 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -237,14 +237,14 @@ void logical_and_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a && b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a && b); @@ -257,14 +257,14 @@ void logical_or_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a || b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a || b); @@ -277,14 +277,14 @@ void logical_xor_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. 
if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(bool(a) != bool(b)); diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index 20a851d1b2ce..de11baa28210 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -10,7 +10,8 @@ namespace at { namespace native { void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_and_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_and_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a && b; }); @@ -18,7 +19,8 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_or_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_or_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); @@ -26,7 +28,8 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, ScalarType::BFloat16, iter.common_dtype(), "logical_xor_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_xor_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); diff --git a/c10/util/complex.h b/c10/util/complex.h index 53ec4f30e539..9c63a2b296fb 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -257,6 +257,11 @@ struct alignas(sizeof(T) * 2) complex { } #endif + // consistent with NumPy behavior + explicit constexpr operator bool() const { + return real() || imag(); + } + constexpr T real() const { return real_; } diff --git a/test/test_torch.py b/test/test_torch.py index 70556dd2d2aa..ee27c8dd65cf 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6342,11 +6342,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): a = torch.tensor(a_, dtype=dtypes[0], device=device) b = torch.tensor(b_, dtype=dtypes[1], device=device) - if dtypes[0].is_complex or dtypes[1].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - # new tensor self.assertEqual(expected_res.bool(), getattr(a, op)(b)) # out @@ -6361,12 +6356,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) return - # TODO: remove when complex ops are supported - if dtypes[0].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - getattr(a, op + '_')(b) self.assertEqual(expected_res, a) From 
3dd0e362db0cadc37b377d3db898464e97e518d7 Mon Sep 17 00:00:00 2001 From: Alex Suhan Date: Wed, 23 Sep 2020 23:17:32 -0700 Subject: [PATCH 081/449] [TensorExpr] Fix min and max for integral inputs in CUDA backend (#44984) Summary: For integral types, isnan is meaningless. Provide specializations for maximum and minimum which don't call it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/44984 Test Plan: python test/test_jit_fuser_te.py -k TestTEFuser.test_minmax_int_ops Reviewed By: ezyang Differential Revision: D23885259 Pulled By: asuhan fbshipit-source-id: 2e6da2c43c0ed18f0b648a2383d510894c574437 --- test/test_jit_fuser_te.py | 39 ++++++++++++++++++++++ torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 12 +++++-- torch/csrc/jit/tensorexpr/kernel.cpp | 4 +-- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index dc7e67a14ee2..6fab65006927 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -536,6 +536,45 @@ def apply(fn): " ".join(["Failed:", str(dtype), op.__name__, device]) ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_minmax_int_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + binary_ops = [ + torch.min, + torch.max + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. 
+ continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_comparison_eq_ne(self): def f(x, y): diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index a64e413657d0..06e6703d494a 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -453,7 +453,11 @@ void CudaPrinter::visit(const AtomicAdd* v) { } void CudaPrinter::visit(const Max* v) { - os() << "maximum("; + if (is_integral(v->dtype().scalar_type())) { + os() << "max("; + } else { + os() << "maximum("; + } v->lhs()->accept(this); os() << ","; v->rhs()->accept(this); @@ -461,7 +465,11 @@ void CudaPrinter::visit(const Max* v) { } void CudaPrinter::visit(const Min* v) { - os() << "minimum("; + if (is_integral(v->dtype().scalar_type())) { + os() << "min("; + } else { + os() << "minimum("; + } v->lhs()->accept(this); os() << ","; v->rhs()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 293ea780ed27..833881cc0e4f 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -816,14 +816,14 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::min: { return computeTwoOperand( "aten_min", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return Min::make(lhs, rhs, false); + return Min::make(boolToInteger(lhs), boolToInteger(rhs), false); }); } break; case aten::max: { return computeTwoOperand( "aten_max", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return Max::make(lhs, rhs, false); + return Max::make(boolToInteger(lhs), boolToInteger(rhs), false); }); } break; From b3d7c2f97859973c7282a772b811708379064d37 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Wed, 23 Sep 2020 23:26:26 -0700 Subject: [PATCH 082/449] [ONNX] Update ONNX docs for release (#45086) Summary: ONNX doc updates. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45086 Reviewed By: ezyang Differential Revision: D23880383 Pulled By: bzinodev fbshipit-source-id: ca29782fd73024967ee7708c217a005233e7b970 --- docs/source/onnx.rst | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index ea45a2d7070a..3c07486b0e89 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -231,6 +231,25 @@ The dynamic control flow is captured correctly. We can verify in backends with d # [37, 37, 37]], dtype=int64)] +To avoid exporting a variable scalar tensor as a fixed value constant as part of the ONNX model, please +avoid use of ``torch.Tensor.item()``. Torch supports implicit cast of single-element tensors to numbers. 
+E.g.: :: + + class LoopModel(torch.nn.Module): + def forward(self, x, y): + res = [] + arr = x.split(2, 0) + for i in range(int(y)): + res += [arr[i].sum(0, False)] + return torch.stack(res) + + model = torch.jit.script(LoopModel()) + inputs = (torch.randn(16), torch.tensor(8)) + + out = model(*inputs) + torch.onnx.export(model, inputs, 'loop_and_list.onnx', opset_version=11, example_outputs=out) + + TorchVision support ------------------- @@ -262,6 +281,7 @@ The following operators are supported: * Conv * Dropout * Embedding (no optional arguments supported) +* EmbeddingBag * FeatureDropout (training mode not supported) * Index * MaxPool1d @@ -289,6 +309,7 @@ The following operators are supported: * avg_pool2d * avg_pool2d * avg_pool3d +* as_strided * baddbmm * bitshift * cat @@ -314,6 +335,7 @@ The following operators are supported: * exp * expand * expand_as +* eye * flatten * floor * floor_divide @@ -335,9 +357,11 @@ The following operators are supported: * instance_norm * interpolate * isnan +* KLDivLoss * layer_norm * le * leaky_relu +* len * log * log1p * log2 @@ -358,6 +382,9 @@ The following operators are supported: * narrow * ne * neg +* new_empty +* new_full +* new_zeros * nll_loss * nonzero * norm @@ -811,7 +838,10 @@ Q: Is tensor list exportable to ONNX? Yes, this is supported now for ONNX opset version >= 11. ONNX introduced the concept of Sequence in opset 11. Similar to list, Sequence is a data type that contains arbitrary number of Tensors. - Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. E.g.: :: + Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. + However, in-place list append within loops is not exportable to ONNX. To implement this, please use inplace + add operator. + E.g.: :: class ListLoopModel(torch.nn.Module): def forward(self, x): @@ -820,8 +850,8 @@ Q: Is tensor list exportable to ONNX? arr = x.split(2, 0) res2 = torch.zeros(3, 4, dtype=torch.long) for i in range(len(arr)): - res = res.append(arr[i].sum(0, False)) - res1 = res1.append(arr[-1 - i].sum(0, False)) + res += [arr[i].sum(0, False)] + res1 += [arr[-1 - i].sum(0, False)] res2 += 1 return torch.stack(res), torch.stack(res1), res2 From 29dc3c5ec821f5b9026e1c847c0ac605672e95af Mon Sep 17 00:00:00 2001 From: Alexander Date: Thu, 24 Sep 2020 00:05:25 -0700 Subject: [PATCH 083/449] Sparse softmax support (CUDA) (#42307) Summary: This PR implements softmax support for sparse tensors. Resolves gh-23651 for CUDA. 
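A minimal usage sketch (values are made up; this assumes the new kernels are reached through the existing `torch.sparse.softmax` / `torch.sparse.log_softmax` Python wrappers):

```python
import torch

i = torch.tensor([[0, 0, 1], [0, 2, 1]])
v = torch.tensor([1.0, 2.0, 3.0])
s = torch.sparse_coo_tensor(i, v, (2, 3), device="cuda")

# Unspecified entries are treated as -inf, so they contribute zero
# probability and remain unspecified in the result.
probs = torch.sparse.softmax(s, dim=1)
log_probs = torch.sparse.log_softmax(s, dim=1)
```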
- [x] sparse softmax - [x] CUDA C++ implementation - [x] unittests - [x] update softmax documentation - [x] autograd support - [x] sparse log_softmax - [x] CUDA C++ implementation - [x] unittests - [x] update log_softmax documentation - [x] autograd support Here are some benchmark (script is [here](https://gist.github.com/aocsa/fbc1827b3e49901512a33ba96092cbc1)) results for `torch.sparse.softmax and torch.softmax`, using CPU and GPU, values are float64 scalars, timing repeat is 1000: | size | density | sparse CUDA | sparse CPU | |--------------|---------|-------------|------------| | (32, 10000) | 0.01 | 380.2 | 687.5 | | (32, 10000) | 0.05 | 404.3 | 2357.9 | | (32, 10000) | 0.1 | 405.9 | 3677.2 | | (512, 10000) | 0.01 | 438.0 | 5443.4 | | (512, 10000) | 0.05 | 888.1 | 24485.0 | | (512, 10000) | 0.1 | 1921.3 | 45340.5 | | size | density | dense CUDA | dense CPU | |--------------|---------|-------------|------------| | (32, 10000) | 0.01 | 23.6 | 1943.2 | | (32, 10000) | 0.05 | 23.6 | 1954.0 | | (32, 10000) | 0.1 | 23.5 | 1950.0 | | (512, 10000) | 0.01 | 639.3 | 39797.9 | | (512, 10000) | 0.05 | 640.3 | 39374.4 | | (512, 10000) | 0.1 | 639.6 | 39192.3 | Times are in microseconds (us). Quick note: I updated the performance test again. Pull Request resolved: https://github.com/pytorch/pytorch/pull/42307 Reviewed By: ngimel Differential Revision: D23774427 Pulled By: mruberry fbshipit-source-id: bfabf726075b39dde544c10249f27ae1871f82c7 --- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/native/sparse/ParamUtils.cpp | 53 ++ aten/src/ATen/native/sparse/ParamUtils.h | 24 + aten/src/ATen/native/sparse/SoftMax.cpp | 117 ++-- aten/src/ATen/native/sparse/cuda/SoftMax.cu | 641 ++++++++++++++++++++ test/test_sparse.py | 2 +- 6 files changed, 777 insertions(+), 64 deletions(-) create mode 100644 aten/src/ATen/native/sparse/ParamUtils.cpp create mode 100644 aten/src/ATen/native/sparse/ParamUtils.h create mode 100644 aten/src/ATen/native/sparse/cuda/SoftMax.cu diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index ae3579cd0aa9..f5bbb263ed9c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3676,11 +3676,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -3693,11 +3695,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? 
p, *, ScalarType dtype) -> Tensor use_c10_dispatcher: full diff --git a/aten/src/ATen/native/sparse/ParamUtils.cpp b/aten/src/ATen/native/sparse/ParamUtils.cpp new file mode 100644 index 000000000000..f2a4c97571b9 --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +namespace at { +namespace native { + +std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name) { + TORCH_INTERNAL_ASSERT(input_.is_sparse()); + TORCH_CHECK( + !half_to_float, + std::string(function_name) + + ": with half to float conversion is not supported on " + + input_.device().str()); + auto input = input_.coalesce(); + Tensor output = at::native::empty_like(input); + TORCH_CHECK( + dim_ >= 0 && dim_ < input.dim(), + ": dim must be non-negative and less than input dimensions"); + return std::make_pair(input, output); +} + +std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize(function_name, grad_arg, output_arg); + + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + + auto grad = grad_.coalesce(); + auto output = output_.coalesce(); + + Tensor grad_input = at::native::empty_like(output); + TORCH_CHECK( + dim >= 0 && dim < grad.dim(), + ": dim must be non-negative and less than input dimensions"); + TORCH_CHECK( + grad.sparse_dim() == output.sparse_dim(), + ": grad and output sparse dimensions must be equal"); + return std::make_tuple(grad_input, grad, output); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/ParamUtils.h b/aten/src/ATen/native/sparse/ParamUtils.h new file mode 100644 index 000000000000..c9b2e3d999ad --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace native { + +TORCH_API std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name); + +TORCH_API std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 1544c6e499e7..6070faf635c5 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace at { @@ -291,10 +292,10 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di if (dim >= sparse_dim) { if (LogSoftMax) { auto new_values = log_softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } else { auto new_values = softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } return; } @@ -411,17 +412,27 @@ void cpu_sparse_coo_softmax_backward(Tensor& grad_input, const Tensor& grad, con auto grad_offsets = get_offsets(grad_indices, sizes, -1); if (dim >= sparse_dim) { - for(int64_t i=0; i= 0 && dim_ < input.dim(), - "dim must be non-negative and less than input dimensions"); 
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } -Tensor log_softmax_sparse_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_float) { - TORCH_INTERNAL_ASSERT(input_.is_sparse()); - TORCH_CHECK(!half_to_float, "log_softmax with half to float conversion is not supported on CPU"); - auto input = input_.coalesce(); - Tensor output = at::native::empty_like(input); +Tensor log_softmax_sparse_cpu( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); if (input.numel() == 0) { return output; } - TORCH_CHECK(dim_ >= 0 && dim_ < input.dim(), - "dim must be non-negative and less than input dimensions"); AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } @@ -542,26 +553,16 @@ Tensor softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } @@ -571,26 +572,16 @@ Tensor log_softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("log_softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu new file mode 100644 index 000000000000..26cb6aba04e0 
--- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -0,0 +1,641 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { +#if defined(__HIP_PLATFORM_HCC__) + int threadSizes[5] = {16, 32, 64, 128, 256}; +#else + int threadSizes[5] = {32, 64, 128, 256, 512}; +#endif + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return threadSizes[4]; +} + +template +__global__ void cuda_sparse_coo_softmax_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + scalar_t* mx_rows, + PackedTensorAccessor input_values_acc, + PackedTensorAccessor output_values_acc) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. + */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + scalar_t* mx_row = mx_rows + index * nvalues; + + for (int64_t j = 0; j < nvalues; j++) { + scalar_t exp_sums = 0; + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + auto v = c10::cuda::compat::exp(values_row[j] - mx_row[j]); + if (!LogSoftMax) { + out_values_row[j] = v; + } + exp_sums += v; + } + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + if (LogSoftMax) { + out_values_row[j] = values_row[j] - mx_row[j] - c10::cuda::compat::log(exp_sums); + } else { + out_values_row[j] *= 1.0 / exp_sums; + } + } + } + index += step; + } +} + +template +__global__ void cuda_sparse_coo_softmax_backward_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + int64_t grad_nnz, + int64_t* grad_offsets, + int64_t* out_offsets, + int64_t* lower_bound_values, + PackedTensorAccessor values_accessor, + PackedTensorAccessor out_values_accessor, + PackedTensorAccessor grad_values_accessor) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. 
+ */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + + for (int64_t k = 0; k < nvalues; k++) { + scalar_t tmp_row{0}; + + /* Compute tmp = - sum_j output_j * grad_j */ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto j = lower_bound_values[i]; + + /* Update `tmp_row` accumulator only when limits and pools are valid */ + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + tmp_row -= grad_values_row[k]; + } else { + tmp_row -= out_values_row[k] * grad_values_row[k]; + } + } + } + + /* Compute grad_input = output * (grad + tmp)*/ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto values_row = values_accessor[i]; + auto j = lower_bound_values[i]; + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + values_row[k] = grad_values_row[k] + + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = + out_values_row[k] * (grad_values_row[k] + tmp_row); + } + } else { + if (LogSoftMax) { + values_row[k] = + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = out_values_row[k] * tmp_row; + } + } + } + } + index += step; + } +} + +using thrust_ptr = thrust::device_ptr; + +Tensor get_offsets( + const Tensor& indices, + const IntArrayRef& sizes, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:get_offsets for the CPU + implementation of get_offsets function that this implementation is based on. + */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto ndim = indices.size(0); + auto nnz = indices.size(1); + std::vector host_strides(ndim, 1); + if (ndim > 1) { + for (int64_t i = ndim - 2; i >= 0; i--) { + host_strides[i] = + host_strides[i + 1] * (i + 1 == dim ? 
1 : sizes[i + 1]); + } + } + auto strides = at::empty({ndim}, indices.options()); + auto strides_ptr = strides.data_ptr(); + + AT_CUDA_CHECK(cudaMemcpyAsync( + strides_ptr, host_strides.data(), host_strides.size() * sizeof(int64_t), + cudaMemcpyHostToDevice, + stream)); + + auto indices_accessor = indices.packed_accessor(); + + Tensor offsets = at::empty({nnz}, indices.options()); + + thrust::transform( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(nnz)), + thrust::device_ptr(offsets.data_ptr()), + [indices_accessor, strides_ptr, dim, ndim] __device__(int64_t x) { + int64_t pool_index = 0; + for (int64_t j = 0; j < ndim; j++) { + if (j != dim) { + auto indices_row = indices_accessor[j]; + auto stride = strides_ptr[j]; + pool_index += stride * indices_row[x]; + } + } + return pool_index; + }); + return offsets; +} + +template +std::tuple compute_pool_max( + const Tensor& indices, + const Tensor& values, + const IntArrayRef& sizes, + int64_t nvalues, + const int64_t dim) { + /* + Return pools of indices that align with the given dimension and the + corresponding max values for each pool. + + See ATen/native/sparse/Softmax.cpp:get_offsets and + ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation that this implementation is based on. + */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = indices.size(1); + auto offsets = get_offsets(indices, sizes, dim); + int64_t* offsets_ptr = offsets.data_ptr(); + + auto sorted_indices = at::empty({nnz}, indices.options()); + thrust_ptr sorted_indices_thrust_ptr(sorted_indices.data_ptr()); + thrust::sequence( + policy, sorted_indices_thrust_ptr, sorted_indices_thrust_ptr + nnz, 0); + + thrust::sort( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] < offsets_ptr[y]; + }); + auto pool_sizes = at::empty({nnz}, indices.options()); + + auto new_end = thrust::reduce_by_key( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + thrust::make_constant_iterator(int64_t(1)), + thrust::make_discard_iterator(), + thrust_ptr(pool_sizes.data_ptr()), + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] == offsets_ptr[y]; + }); + auto new_sz = thrust::distance( + thrust_ptr(pool_sizes.data_ptr()), new_end.second); + pool_sizes.resize_({new_sz}); + + auto pool_offsets = pool_sizes.clone(); + thrust_ptr pool_offsets_thrust_ptr( + pool_offsets.data_ptr()); + thrust::exclusive_scan( + policy, + pool_offsets_thrust_ptr, + pool_offsets_thrust_ptr + new_sz, + pool_offsets_thrust_ptr); + + Tensor mx_buffer; + if (requireMxRows) { + + auto values_accessor = + values.packed_accessor(); // {nnz, nvalues} + + mx_buffer = at::full({new_sz * nvalues}, Scalar(-std::numeric_limits::infinity()), values.options()); + + auto mx_buffer_ptr = mx_buffer.data_ptr(); + + auto pool_sizes_ptr = pool_sizes.data_ptr(); + auto sorted_indices_ptr = sorted_indices.data_ptr(); + auto pool_offsets_ptr = pool_offsets.data_ptr(); + + thrust::for_each( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(new_sz)), + [values_accessor, + sorted_indices_ptr, + pool_sizes_ptr, + pool_offsets_ptr, + mx_buffer_ptr, + nvalues] __device__(int64_t index) { + int64_t curr_pool_size = pool_sizes_ptr[index]; 
+ auto mx_row = mx_buffer_ptr + index * nvalues; + int64_t offset = pool_offsets_ptr[index]; + for (int64_t p = 0; p < curr_pool_size; p++) { + int64_t i = *(sorted_indices_ptr + offset + p); + auto values_row = values_accessor[i].data(); + for (int64_t j = 0; j < nvalues; j++) { + mx_row[j] = c10::cuda::compat::max(mx_row[j], values_row[j]); + } + } + }); + } + return std::make_tuple( + sorted_indices, pool_offsets, pool_sizes, mx_buffer); +} + +template +void cuda_sparse_coo_softmax( + Tensor& output, + const Tensor& input, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. + */ + auto sparse_dim = input.sparse_dim(); + auto indices = input._indices().contiguous(); + auto values = input._values().contiguous(); + auto out_values = output._values(); + auto out_indices = output._indices(); + out_values.resize_as_(values); + out_indices.resize_as_(indices); + out_indices.copy_(indices); + + if (dim >= sparse_dim) { + if (LogSoftMax) { + auto new_values = log_softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } else { + auto new_values = softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } + return; + } + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = values.size(0); + auto sizes = input.sizes(); + auto nvalues = values.numel() / nnz; + + /* Prepare accessors */ + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + Tensor mx_buffer; + + std::tie(sorted_indices, pool_offsets, pool_sizes, mx_buffer) = + compute_pool_max(indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + int block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + mx_buffer.data_ptr(), + values_accessor, + out_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +template +void cuda_sparse_coo_softmax_backward( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. 
+ */ + auto sparse_dim = output.sparse_dim(); + auto sizes = output.sizes().vec(); + auto grad_indices = grad._indices().contiguous(); + auto grad_values = grad._values().contiguous(); + auto out_indices = output._indices().contiguous(); + auto out_values = output._values().contiguous(); + auto values = grad_input._values(); + auto indices = grad_input._indices(); + auto out_nnz = out_values.size(0); + auto grad_nnz = grad_values.size(0); + + values.resize_as_(out_values); + values.zero_(); + indices.resize_as_(out_indices); + indices.copy_(out_indices); + + auto out_offsets = get_offsets(out_indices, sizes, -1); + auto grad_offsets = get_offsets(grad_indices, sizes, -1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + /* when dim >= sparse_dim the dense backward is used */ + if (dim >= sparse_dim) { + if (at::native::cuda_equal(out_offsets, grad_offsets) == true) { + Tensor unused = at::native::empty_like(grad_values); + if (LogSoftMax) { + auto r = log_softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } else { + auto r = softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } + } else { + auto host_out_offsets = + out_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto host_grad_offsets = + grad_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto out_offsets_accessor = host_out_offsets.data_ptr(); + auto grad_offsets_accessor = host_grad_offsets.data_ptr(); + for (int64_t i = 0; i < out_nnz; i++) { + Tensor unused = at::native::empty_like(grad_values); + auto low = thrust::lower_bound( + grad_offsets_accessor, + grad_offsets_accessor + grad_offsets.size(0), + out_offsets_accessor[i]); + auto j = low - grad_offsets_accessor; + /* + Compute output using dense backward only when limits and pools are valid + If this check is false then a sparse tensor with full of zeros is returned + */ + if (j < grad_nnz && out_offsets_accessor[i] == grad_offsets_accessor[j]) { + if (LogSoftMax) { + auto r = log_softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } else { + auto r = softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } + } + } + } + return; + } + + auto nnz = values.size(0); + auto nvalues = values.numel() / nnz; + + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({out_nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + auto grad_values_2 = grad_values.view({grad_nnz, nvalues}); + auto grad_values_accessor = grad_values_2.packed_accessor(); + + Tensor lower_bound_values = + at::empty({out_offsets.size(0)}, indices.options()); + + thrust::lower_bound( + policy, + thrust_ptr(grad_offsets.data_ptr()), + thrust_ptr(grad_offsets.data_ptr() + grad_offsets.size(0)), + thrust_ptr(out_offsets.data_ptr()), + thrust_ptr(out_offsets.data_ptr()) + out_offsets.size(0), + thrust_ptr(lower_bound_values.data_ptr())); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + + /* Compute independent pools of indices */ + std::tie( + sorted_indices, pool_offsets, pool_sizes, std::ignore) = + compute_pool_max( + out_indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + + int 
block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_backward_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + grad_nnz, + grad_offsets.data_ptr(), + out_offsets.data_ptr(), + lower_bound_values.data_ptr(), + values_accessor, + out_values_accessor, + grad_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +} // end anonymous namespace + +Tensor softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor log_softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +Tensor log_softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/test/test_sparse.py b/test/test_sparse.py index af833be6810c..2a0e76afe36a 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -2589,7 +2589,7 @@ def test_sparse_to_numpy(self): t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4])) self.assertRaises(TypeError, lambda: t.numpy()) - @cpu_only + @skipIfRocm def test_softmax(self): import torch.nn.functional as F From 6d21d5f0b33c755f715efa1ed498c017629fcd93 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 24 Sep 2020 00:19:05 -0700 Subject: [PATCH 084/449] gtest-ify JIT tests, through the letter c (#45249) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45249 Reland of https://github.com/pytorch/pytorch/pull/45055 and https://github.com/pytorch/pytorch/pull/45020 See https://github.com/pytorch/pytorch/pull/45018 for context. 
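The shape of the conversion is mechanical. A minimal before/after sketch (the suite and case names mirror ones touched in this diff, but the body below is a stand-in rather than the real test):

```cpp
#include <gtest/gtest.h>

#include <string>

// Old style: a plain function, declared in test/cpp/jit/tests.h and run
// from a hand-maintained harness, e.g.
//
//   void testCleanUpPasses() {
//     ...
//     ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice);
//   }
//
// New style: the same assertions live inside a TEST macro, and gtest
// takes care of discovery, registration, filtering, and reporting.
TEST(CleanupPassTest, Basic) {
  // Illustrative body only: the real case parses IR, runs
  // runCleanupPasses twice, and checks the printed graph is stable.
  std::string after_first_pass = "graph(...)";
  std::string after_second_pass = "graph(...)";
  ASSERT_EQ(after_first_pass, after_second_pass);
}
```

Because each TEST self-registers with the gtest runner, the hand-written listings in test/cpp/jit/tests.h can be trimmed as files are converted.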
Test Plan: Imported from OSS Reviewed By: jamesr66a Differential Revision: D23892645 Pulled By: suo fbshipit-source-id: e7fe58d5e1a5a0c44f4e2aec9694145afabde0fd --- test/cpp/jit/CMakeLists.txt | 7 +- test/cpp/jit/test_autodiff.cpp | 9 +- test/cpp/jit/test_class_import.cpp | 12 +- test/cpp/jit/test_class_parser.cpp | 4 +- test/cpp/jit/test_cleanup_passes.cpp | 37 +- test/cpp/jit/test_code_template.cpp | 50 ++- test/cpp/jit/test_constant_pooling.cpp | 87 ++--- .../jit/test_create_autodiff_subgraphs.cpp | 5 +- test/cpp/jit/test_custom_class.cpp | 315 +--------------- .../jit/test_custom_class_registrations.cpp | 291 +++++++++++++++ .../cpp/jit/test_custom_class_registrations.h | 36 ++ test/cpp/jit/test_custom_operators.cpp | 342 +++++++++--------- test/cpp/jit/test_dce.cpp | 6 +- test/cpp/jit/test_fuser.cpp | 41 ++- test/cpp/jit/test_misc.cpp | 10 + test/cpp/jit/tests.h | 23 +- 16 files changed, 641 insertions(+), 634 deletions(-) create mode 100644 test/cpp/jit/test_custom_class_registrations.cpp create mode 100644 test/cpp/jit/test_custom_class_registrations.h diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 84f7193ad8c0..b8f6ef195226 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -2,7 +2,10 @@ set(JIT_TEST_ROOT ${TORCH_ROOT}/test/cpp/jit) # Build separate libraries the define custom classes/operators used from our Python tests. # These are intended to be used with torch.ops.load_library() in our Python test suite. -add_library(torchbind_test SHARED ${JIT_TEST_ROOT}/test_custom_class.cpp) +add_library(torchbind_test SHARED + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp +) target_link_libraries(torchbind_test torch) add_library(jitbackend_test SHARED ${JIT_TEST_ROOT}/test_backend.cpp) @@ -30,6 +33,8 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_cleanup_passes.cpp ${JIT_TEST_ROOT}/test_create_autodiff_subgraphs.cpp ${JIT_TEST_ROOT}/test_custom_class.cpp + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp ${JIT_TEST_ROOT}/test_custom_operators.cpp ${JIT_TEST_ROOT}/test_dce.cpp ${JIT_TEST_ROOT}/test_fuser.cpp diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 7d431776a971..3993c63b1708 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -83,7 +84,7 @@ variable_list grad( fmap(inputs, get_edge)); } -void testADFormulas() { +TEST(AutodiffTest, ADFormulas) { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -174,7 +175,7 @@ void testADFormulas() { } } -void testDifferentiate() { +TEST(AutodiffTest, Differentiate) { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -229,7 +230,7 @@ void testDifferentiate() { ->run(*grad_spec.df); } -void testDifferentiateWithRequiresGrad() { +TEST(AutodiffTest, DifferentiateWithRequiresGrad) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index 82bc0cf3bccc..ffa845b3e2a8 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include -#include +#include #include +#include 
#include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testClassImport() { +TEST(ClassImportTest, Basic) { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ void testClassImport() { ASSERT_FALSE(c); } -void testScriptObject() { +TEST(ClassImportTest, ScriptObject) { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -void testClassDerive() { +TEST(ClassImportTest, ClassDerive) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -void testSaveLoadTorchbind() { +TEST(ClassImportTest, CustomClass) { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of FooTest into two namespaces. diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index 45e37103bb5a..a5b19f63fd3f 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -15,7 +17,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -void testClassParser() { +TEST(ClassParserTest, Basic) { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 2f2ca4e0a19b..38ceef932eb0 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -void testCleanUpPasses() { +TEST(CleanupPassTest, Basic) { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. 
- { - auto graph = std::make_shared(); - parseIR( - R"IR( + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,20 +31,19 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); - } + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index e4d7d1ef856e..bf539e3d169f 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include "test/cpp/jit/test_base.h" -#include "test/cpp/jit/test_utils.h" +#include +#include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,31 +33,29 @@ static const auto ct_expect = R"( int notest(int a) )"; -void testCodeTemplate() { - { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); - } +TEST(TestCodeTemplate, Copying) { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); +} - { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); - } +TEST(TestCodeTemplate, Formatting) { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", "others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index b949c9a45b25..c8cb58e1886a 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -11,26 +12,26 @@ namespace torch { namespace jit { -void testConstantPooling() { - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(ConstantPoolingTest, Int) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - 
->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingAcrossBlocks) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -44,17 +45,18 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingDifferentDevices) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -70,22 +72,21 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); - } + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index 8da6d9d6a1b2..e97043f84d24 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -6,7 +7,7 @@ namespace torch { namespace jit { -void testCreateAutodiffSubgraphs() { +TEST(CreateAutodiffSubgraphsTest, Basic) { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 543fbc20eb3d..a96a3b4a5635 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,3 +1,6 @@ +#include + +#include #include #include @@ -8,317 +11,7 @@ namespace torch { namespace jit { -namespace { - -struct Foo : torch::CustomClassHolder { - int x, y; - Foo() : x(0), y(0) {} - Foo(int x_, int y_) : x(x_), y(y_) {} - int64_t info() { - return this->x * this->y; - } - int64_t add(int64_t z) { - return (x + y) * z; - } - void increment(int64_t z) { - this->x += z; - this->y += z; - } - int64_t combine(c10::intrusive_ptr b) { - 
return this->info() + b->info(); - } - ~Foo() { - // std::cout<<"Destroying object with values: "< -struct MyStackClass : torch::CustomClassHolder { - std::vector stack_; - MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} - - void push(T x) { - stack_.push_back(x); - } - T pop() { - auto val = stack_.back(); - stack_.pop_back(); - return val; - } - - c10::intrusive_ptr clone() const { - return c10::make_intrusive(stack_); - } - - void merge(const c10::intrusive_ptr& c) { - for (auto& elem : c->stack_) { - push(elem); - } - } - - std::tuple return_a_tuple() const { - return std::make_tuple(1337.0f, 123); - } -}; - -struct PickleTester : torch::CustomClassHolder { - PickleTester(std::vector vals) : vals(std::move(vals)) {} - std::vector vals; -}; - -at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { - return torch::zeros({instance->vals.back(), 4}); -} - -struct ElementwiseInterpreter : torch::CustomClassHolder { - using InstructionType = std::tuple< - std::string /*op*/, - std::vector /*inputs*/, - std::string /*output*/>; - - ElementwiseInterpreter() {} - - // Load a list of instructions into the interpreter. As specified above, - // instructions specify the operation (currently support "add" and "mul"), - // the names of the input values, and the name of the single output value - // from this instruction - void setInstructions(std::vector instructions) { - instructions_ = std::move(instructions); - } - - // Add a constant. The interpreter maintains a set of constants across - // calls. They are keyed by name, and constants can be referenced in - // Instructions by the name specified - void addConstant(const std::string& name, at::Tensor value) { - constants_.insert_or_assign(name, std::move(value)); - } - - // Set the string names for the positional inputs to the function this - // interpreter represents. When invoked, the interpreter will assign - // the positional inputs to the names in the corresponding position in - // input_names. - void setInputNames(std::vector input_names) { - input_names_ = std::move(input_names); - } - - // Specify the output name for the function this interpreter represents. This - // should match the "output" field of one of the instructions in the - // instruction list, typically the last instruction. - void setOutputName(std::string output_name) { - output_name_ = std::move(output_name); - } - - // Invoke this interpreter. This takes a list of positional inputs and returns - // a single output. Currently, inputs and outputs must all be Tensors. - at::Tensor __call__(std::vector inputs) { - // Environment to hold local variables - std::unordered_map environment; - - // Load inputs according to the specified names - if (inputs.size() != input_names_.size()) { - std::stringstream err; - err << "Expected " << input_names_.size() << " inputs, but got " - << inputs.size() << "!"; - throw std::runtime_error(err.str()); - } - for (size_t i = 0; i < inputs.size(); ++i) { - environment[input_names_[i]] = inputs[i]; - } - - for (InstructionType& instr : instructions_) { - // Retrieve all input values for this op - std::vector inputs; - for (const auto& input_name : std::get<1>(instr)) { - // Operator output values shadow constants. - // Imagine all constants are defined in statements at the beginning - // of a function (a la K&R C). Any definition of an output value must - // necessarily come after constant definition in textual order. 
Thus, - // We look up values in the environment first then the constant table - // second to implement this shadowing behavior - if (environment.find(input_name) != environment.end()) { - inputs.push_back(environment.at(input_name)); - } else if (constants_.find(input_name) != constants_.end()) { - inputs.push_back(constants_.at(input_name)); - } else { - std::stringstream err; - err << "Instruction referenced unknown value " << input_name << "!"; - throw std::runtime_error(err.str()); - } - } - - // Run the specified operation - at::Tensor result; - const auto& op = std::get<0>(instr); - if (op == "add") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for add op!"); - } - result = inputs[0] + inputs[1]; - } else if (op == "mul") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for mul op!"); - } - result = inputs[0] * inputs[1]; - } else { - std::stringstream err; - err << "Unknown operator " << op << "!"; - throw std::runtime_error(err.str()); - } - - // Write back result into environment - const auto& output_name = std::get<2>(instr); - environment[output_name] = std::move(result); - } - - if (!output_name_) { - throw std::runtime_error("Output name not specififed!"); - } - - return environment.at(*output_name_); - } - - // Ser/De infrastructure. See - // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes - // for more info. - - // This is the type we will use to marshall information on disk during - // ser/de. It is a simple tuple composed of primitive types and simple - // collection types like vector, optional, and dict. - using SerializationType = std::tuple< - std::vector /*input_names_*/, - c10::optional /*output_name_*/, - c10::Dict /*constants_*/, - std::vector /*instructions_*/ - >; - - // This function yields the SerializationType instance for `this`. - SerializationType __getstate__() const { - return SerializationType{ - input_names_, output_name_, constants_, instructions_}; - } - - // This function will create an instance of `ElementwiseInterpreter` given - // an instance of `SerializationType`. 
- static c10::intrusive_ptr __setstate__( - SerializationType state) { - auto instance = c10::make_intrusive(); - std::tie( - instance->input_names_, - instance->output_name_, - instance->constants_, - instance->instructions_) = std::move(state); - return instance; - } - - // Class members - std::vector input_names_; - c10::optional output_name_; - c10::Dict constants_; - std::vector instructions_; -}; - -TORCH_LIBRARY(_TorchScriptTesting, m) { - m.class_("_Foo") - .def(torch::init()) - // .def(torch::init<>()) - .def("info", &Foo::info) - .def("increment", &Foo::increment) - .def("add", &Foo::add) - .def("combine", &Foo::combine); - - m.class_("_NoInit").def( - "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); - - m.class_>("_StackString") - .def(torch::init>()) - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge) - .def_pickle( - [](const c10::intrusive_ptr>& self) { - return self->stack_; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive>( - std::vector{"i", "was", "deserialized"}); - }) - .def("return_a_tuple", &MyStackClass::return_a_tuple) - .def( - "top", - [](const c10::intrusive_ptr>& self) - -> std::string { return self->stack_.back(); }) - .def( - "__str__", - [](const c10::intrusive_ptr>& self) { - std::stringstream ss; - ss << "["; - for (size_t i = 0; i < self->stack_.size(); ++i) { - ss << self->stack_[i]; - if (i != self->stack_.size() - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - }); - // clang-format off - // The following will fail with a static assert telling you you have to - // take an intrusive_ptr as the first argument. - // .def("foo", [](int64_t a) -> int64_t{ return 3;}); - // clang-format on - - m.class_("_PickleTester") - .def(torch::init>()) - .def_pickle( - [](c10::intrusive_ptr self) { // __getstate__ - return std::vector{1, 3, 3, 7}; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive(std::move(state)); - }) - .def( - "top", - [](const c10::intrusive_ptr& self) { - return self->vals.back(); - }) - .def("pop", [](const c10::intrusive_ptr& self) { - auto val = self->vals.back(); - self->vals.pop_back(); - return val; - }); - - m.def( - "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", - take_an_instance); - // test that schema inference is ok too - m.def("take_an_instance_inferred", take_an_instance); - - m.class_("_ElementwiseInterpreter") - .def(torch::init<>()) - .def("set_instructions", &ElementwiseInterpreter::setInstructions) - .def("add_constant", &ElementwiseInterpreter::addConstant) - .def("set_input_names", &ElementwiseInterpreter::setInputNames) - .def("set_output_name", &ElementwiseInterpreter::setOutputName) - .def("__call__", &ElementwiseInterpreter::__call__) - .def_pickle( - /* __getstate__ */ - [](const c10::intrusive_ptr& self) { - return self->__getstate__(); - }, - /* __setstate__ */ - [](ElementwiseInterpreter::SerializationType state) { - return ElementwiseInterpreter::__setstate__(std::move(state)); - }); -} - -} // namespace - -void testTorchbindIValueAPI() { +TEST(CustomClassTest, TorchbindIValueAPI) { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp new file mode 100644 index 000000000000..f563120bbc6c --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -0,0 +1,291 @@ +#include + 
+#include +#include + +#include +#include +#include + +using namespace torch::jit; + +namespace { + +struct Foo : torch::CustomClassHolder { + int x, y; + Foo() : x(0), y(0) {} + Foo(int x_, int y_) : x(x_), y(y_) {} + int64_t info() { + return this->x * this->y; + } + int64_t add(int64_t z) { + return (x + y) * z; + } + void increment(int64_t z) { + this->x += z; + this->y += z; + } + int64_t combine(c10::intrusive_ptr b) { + return this->info() + b->info(); + } + ~Foo() { + // std::cout<<"Destroying object with values: "< vals) : vals(std::move(vals)) {} + std::vector vals; +}; + +at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { + return torch::zeros({instance->vals.back(), 4}); +} + +struct ElementwiseInterpreter : torch::CustomClassHolder { + using InstructionType = std::tuple< + std::string /*op*/, + std::vector /*inputs*/, + std::string /*output*/>; + + ElementwiseInterpreter() {} + + // Load a list of instructions into the interpreter. As specified above, + // instructions specify the operation (currently support "add" and "mul"), + // the names of the input values, and the name of the single output value + // from this instruction + void setInstructions(std::vector instructions) { + instructions_ = std::move(instructions); + } + + // Add a constant. The interpreter maintains a set of constants across + // calls. They are keyed by name, and constants can be referenced in + // Instructions by the name specified + void addConstant(const std::string& name, at::Tensor value) { + constants_.insert_or_assign(name, std::move(value)); + } + + // Set the string names for the positional inputs to the function this + // interpreter represents. When invoked, the interpreter will assign + // the positional inputs to the names in the corresponding position in + // input_names. + void setInputNames(std::vector input_names) { + input_names_ = std::move(input_names); + } + + // Specify the output name for the function this interpreter represents. This + // should match the "output" field of one of the instructions in the + // instruction list, typically the last instruction. + void setOutputName(std::string output_name) { + output_name_ = std::move(output_name); + } + + // Invoke this interpreter. This takes a list of positional inputs and returns + // a single output. Currently, inputs and outputs must all be Tensors. + at::Tensor __call__(std::vector inputs) { + // Environment to hold local variables + std::unordered_map environment; + + // Load inputs according to the specified names + if (inputs.size() != input_names_.size()) { + std::stringstream err; + err << "Expected " << input_names_.size() << " inputs, but got " + << inputs.size() << "!"; + throw std::runtime_error(err.str()); + } + for (size_t i = 0; i < inputs.size(); ++i) { + environment[input_names_[i]] = inputs[i]; + } + + for (InstructionType& instr : instructions_) { + // Retrieve all input values for this op + std::vector inputs; + for (const auto& input_name : std::get<1>(instr)) { + // Operator output values shadow constants. + // Imagine all constants are defined in statements at the beginning + // of a function (a la K&R C). Any definition of an output value must + // necessarily come after constant definition in textual order. 
Thus, + // We look up values in the environment first then the constant table + // second to implement this shadowing behavior + if (environment.find(input_name) != environment.end()) { + inputs.push_back(environment.at(input_name)); + } else if (constants_.find(input_name) != constants_.end()) { + inputs.push_back(constants_.at(input_name)); + } else { + std::stringstream err; + err << "Instruction referenced unknown value " << input_name << "!"; + throw std::runtime_error(err.str()); + } + } + + // Run the specified operation + at::Tensor result; + const auto& op = std::get<0>(instr); + if (op == "add") { + if (inputs.size() != 2) { + throw std::runtime_error("Unexpected number of inputs for add op!"); + } + result = inputs[0] + inputs[1]; + } else if (op == "mul") { + if (inputs.size() != 2) { + throw std::runtime_error("Unexpected number of inputs for mul op!"); + } + result = inputs[0] * inputs[1]; + } else { + std::stringstream err; + err << "Unknown operator " << op << "!"; + throw std::runtime_error(err.str()); + } + + // Write back result into environment + const auto& output_name = std::get<2>(instr); + environment[output_name] = std::move(result); + } + + if (!output_name_) { + throw std::runtime_error("Output name not specififed!"); + } + + return environment.at(*output_name_); + } + + // Ser/De infrastructure. See + // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes + // for more info. + + // This is the type we will use to marshall information on disk during + // ser/de. It is a simple tuple composed of primitive types and simple + // collection types like vector, optional, and dict. + using SerializationType = std::tuple< + std::vector /*input_names_*/, + c10::optional /*output_name_*/, + c10::Dict /*constants_*/, + std::vector /*instructions_*/ + >; + + // This function yields the SerializationType instance for `this`. + SerializationType __getstate__() const { + return SerializationType{ + input_names_, output_name_, constants_, instructions_}; + } + + // This function will create an instance of `ElementwiseInterpreter` given + // an instance of `SerializationType`. 
+ static c10::intrusive_ptr __setstate__( + SerializationType state) { + auto instance = c10::make_intrusive(); + std::tie( + instance->input_names_, + instance->output_name_, + instance->constants_, + instance->instructions_) = std::move(state); + return instance; + } + + // Class members + std::vector input_names_; + c10::optional output_name_; + c10::Dict constants_; + std::vector instructions_; +}; + +TORCH_LIBRARY(_TorchScriptTesting, m) { + m.class_("_Foo") + .def(torch::init()) + // .def(torch::init<>()) + .def("info", &Foo::info) + .def("increment", &Foo::increment) + .def("add", &Foo::add) + .def("combine", &Foo::combine); + + m.class_("_NoInit").def( + "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); + + m.class_>("_StackString") + .def(torch::init>()) + .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge) + .def_pickle( + [](const c10::intrusive_ptr>& self) { + return self->stack_; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive>( + std::vector{"i", "was", "deserialized"}); + }) + .def("return_a_tuple", &MyStackClass::return_a_tuple) + .def( + "top", + [](const c10::intrusive_ptr>& self) + -> std::string { return self->stack_.back(); }) + .def( + "__str__", + [](const c10::intrusive_ptr>& self) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < self->stack_.size(); ++i) { + ss << self->stack_[i]; + if (i != self->stack_.size() - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + }); + // clang-format off + // The following will fail with a static assert telling you you have to + // take an intrusive_ptr as the first argument. + // .def("foo", [](int64_t a) -> int64_t{ return 3;}); + // clang-format on + + m.class_("_PickleTester") + .def(torch::init>()) + .def_pickle( + [](c10::intrusive_ptr self) { // __getstate__ + return std::vector{1, 3, 3, 7}; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive(std::move(state)); + }) + .def( + "top", + [](const c10::intrusive_ptr& self) { + return self->vals.back(); + }) + .def("pop", [](const c10::intrusive_ptr& self) { + auto val = self->vals.back(); + self->vals.pop_back(); + return val; + }); + + m.def( + "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", + take_an_instance); + // test that schema inference is ok too + m.def("take_an_instance_inferred", take_an_instance); + + m.class_("_ElementwiseInterpreter") + .def(torch::init<>()) + .def("set_instructions", &ElementwiseInterpreter::setInstructions) + .def("add_constant", &ElementwiseInterpreter::addConstant) + .def("set_input_names", &ElementwiseInterpreter::setInputNames) + .def("set_output_name", &ElementwiseInterpreter::setOutputName) + .def("__call__", &ElementwiseInterpreter::__call__) + .def_pickle( + /* __getstate__ */ + [](const c10::intrusive_ptr& self) { + return self->__getstate__(); + }, + /* __setstate__ */ + [](ElementwiseInterpreter::SerializationType state) { + return ElementwiseInterpreter::__setstate__(std::move(state)); + }); +} + +} // namespace diff --git a/test/cpp/jit/test_custom_class_registrations.h b/test/cpp/jit/test_custom_class_registrations.h new file mode 100644 index 000000000000..4e6b7bd43883 --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.h @@ -0,0 +1,36 @@ +#include +#include + +namespace torch { +namespace jit { + +template +struct MyStackClass : torch::CustomClassHolder { + std::vector stack_; + 
MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} + + void push(T x) { + stack_.push_back(x); + } + T pop() { + auto val = stack_.back(); + stack_.pop_back(); + return val; + } + + c10::intrusive_ptr clone() const { + return c10::make_intrusive(stack_); + } + + void merge(const c10::intrusive_ptr& c) { + for (auto& elem : c->stack_) { + push(elem); + } + } + + std::tuple return_a_tuple() const { + return std::make_tuple(1337.0f, 123); + } +}; +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 529b36385bd4..d3f61268e8f1 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -11,134 +12,135 @@ namespace torch { namespace jit { -void testCustomOperators() { - { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, InferredSchema) { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> Tensor", - [](double a, at::Tensor b) { return a + b; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ExplicitSchema) { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + auto& ops = + getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - 
ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - // Check that lists work well. - torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); - } - { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ListParameters) { + // Check that lists work well. 
+ torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); +} - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); +TEST(CustomOperatorTest, ListParameters2) { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - ASSERT_EQ(op->schema().arguments().size(), 1); - ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); - } + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); } -void testCustomOperatorAliasing() { +TEST(CustomOperatorTest, Aliasing) { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -182,77 +184,65 @@ graph(%x: Tensor, %y: Tensor): } } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - -void testTemplatedOperatorCreator() { - constexpr char op_list[] = "foofoo::bar.template;foo::another"; +static constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ 
torch::detail::SelectiveStr(n) - { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); - } +TEST(TestCustomOperator, OperatorGeneratorUndeclared) { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); +} - { - // The operator should be successfully registered since its name is in the - // whitelist. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } +TEST(TestCustomOperator, OperatorGeneratorBasic) { + // The operator should be successfully registered since its name is in the + // whitelist. 
+ torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); } } // namespace jit diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 5799913c316a..6f9161d0d9ae 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include -#include +#include +#include #include #include namespace torch { namespace jit { -void testDCE() { +TEST(EliminateDeadCodeTest, Basic) { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ee0ea060f02f..ef595215b882 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include "test/cpp/jit/test_base.h" +#include #include #include "ATen/core/interned_strings.h" @@ -56,28 +56,27 @@ namespace torch { namespace jit { -void testFusion() { - auto testSimple = [&] { - const auto graph_string = R"IR( +TEST(FuserTest, TestSimple_CUDA) { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); - }; - testSimple(); + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); +} +TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -132,7 +131,9 @@ void testFusion() { testOne(0, 1); testOne(1, 2); testOne(0, 2); +} +TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -175,7 +176,7 @@ void testFusion() { }; } -void testFusionAliasing() { +TEST(FuserTest, FusionAliasing) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -200,7 +201,7 @@ void testFusionAliasing() { ->run(*g); } -void 
testRegisterFusionCachesKernel() { +TEST(FuserTest, KernelCaching) { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 953d1bf42fc0..92baba1168da 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2225,5 +2225,15 @@ void testProfilerDisableInCallback() { t.join(); } +void testIValueKWargs() { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 45d7f48b1f8a..186aaaec2bba 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -9,38 +9,26 @@ namespace torch { namespace jit { #define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ _(Attributes) \ _(Blocks) \ _(CallStack) \ _(CallStackCaching) \ - _(CodeTemplate) \ _(ControlFlow) \ - _(CreateAutodiffSubgraphs) \ - _(CustomOperators) \ - _(CustomOperatorAliasing) \ - _(TemplatedOperatorCreator) \ _(IValueKWargs) \ _(CustomFusion) \ _(SchemaMatching) \ - _(Differentiate) \ - _(DifferentiateWithRequiresGrad) \ _(FromQualString) \ _(InternedStrings) \ _(PassManagement) \ _(Proto) \ - _(RegisterFusionCachesKernel) \ _(SchemaParser) \ _(TopologicalIndex) \ _(SubgraphUtils) \ _(SubgraphUtilsVmap) \ _(IRParser) \ - _(ConstantPooling) \ - _(CleanUpPasses) \ _(THNNConv) \ _(ATenNativeBatchNorm) \ _(NoneSchemaMatch) \ - _(ClassParser) \ _(UnifyTypes) \ _(Profiler) \ _(FallbackGraphs) \ @@ -61,15 +49,10 @@ namespace jit { _(ModuleDeepcopyAliasing) \ _(ModuleDefine) \ _(QualifiedName) \ - _(ClassImport) \ - _(ScriptObject) \ _(ExtraFilesHookPreference) \ _(SaveExtraFilesHook) \ _(TypeTags) \ - _(DCE) \ _(CustomFusionNestedBlocks) \ - _(ClassDerive) \ - _(SaveLoadTorchbind) \ _(ModuleInterfaceSerialization) \ _(ModuleCloneWithModuleInterface) \ _(ClassTypeAddRemoveAttr) \ @@ -100,7 +83,6 @@ namespace jit { _(LiteInterpreterHierarchyModuleInfo) \ _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ _(LiteInterpreterEval) \ - _(TorchbindIValueAPI) \ _(LiteInterpreterDict) \ _(LiteInterpreterFindAndRunMethod) \ _(LiteInterpreterFindWrongMethodName) \ @@ -109,12 +91,10 @@ namespace jit { _(MobileSaveLoadParameters) \ _(MobileSaveLoadParametersEmpty) \ _(LiteSGD) \ - _(LiteSequentialSampler) \ - _(FusionAliasing) + _(LiteSequentialSampler) #if defined(USE_CUDA) #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ @@ -219,7 +199,6 @@ namespace jit { _(GPU_FusionThreadPredicate) #else #define TH_FORALL_TESTS_CUDA(_) \ - _(Fusion) \ _(GraphExecutor) \ _(ModuleConversion) \ _(Interp) \ From dc67b47bc9d53dbeb898a4d920b0225ac73629ec Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Thu, 24 Sep 2020 02:38:00 -0700 Subject: [PATCH 085/449] Deprecate old fft functions (#44876) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44876 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D23866715 Pulled By: mruberry fbshipit-source-id: 73305eb02f92cbd1ef7d175419529d19358fedda --- aten/src/ATen/native/SpectralOps.cpp | 16 ++++++++++++++++ docs/source/fft.rst | 2 ++ torch/_torch_docs.py | 26 ++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 120ef9f73042..1e9c1bce67d3 
100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -561,6 +561,10 @@ void _cufft_clear_plan_cache(int64_t device_index) { } Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.fft is deprecated and will be removed in PyTorch 1.8. " + "Use the new torch.fft module functions, instead, by importing torch.fft " + "and calling torch.fft.fft or torch.fft.fftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -568,6 +572,10 @@ Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) } Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.ifft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.ifftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ true, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, @@ -576,6 +584,10 @@ Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided) { + TORCH_WARN_ONCE( + "The function torch.rfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.fft or torch.fft.rfft."); return _fft(self, signal_ndim, /* complex_input */ false, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -584,6 +596,10 @@ Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided, IntArrayRef signal_sizes) { + TORCH_WARN_ONCE( + "The function torch.irfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.irfft."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ false, /* inverse */ true, signal_sizes, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, diff --git a/docs/source/fft.rst b/docs/source/fft.rst index a732f3e5c652..ab50bd271d32 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -1,6 +1,8 @@ .. role:: hidden :class: hidden-section +.. _torch-fft-module: + torch.fft ========= diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 5a3b2339fde5..32806259df35 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -8514,6 +8514,12 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.ifft`. +.. deprecated:: 1.7.0 + The function :func:`torch.fft` is deprecated and will be removed in + PyTorch 1.8. Use the new :ref:`torch.fft ` module + functions, instead, by importing :ref:`torch.fft ` and + calling :func:`torch.fft.fft` or :func:`torch.fft.fftn`. + .. note:: For CUDA tensors, an LRU cache is used for cuFFT plans to speed up repeatedly running FFT methods on tensors of same geometry with same @@ -8617,6 +8623,12 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.fft`. +.. 
deprecated:: 1.7.0 + The function :func:`torch.ifft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.ifft` or :func:`torch.fft.ifftn`. + .. note:: For CUDA tensors, an LRU cache is used for cuFFT plans to speed up repeatedly running FFT methods on tensors of same geometry with same @@ -8705,6 +8717,13 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.irfft`. +.. deprecated:: 1.7.0 + The function :func:`torch.rfft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.rfft` for one-sided output, or + :func:`torch.fft.fft` for two-sided output. + .. note:: For CUDA tensors, an LRU cache is used for cuFFT plans to speed up repeatedly running FFT methods on tensors of same geometry with same @@ -8777,6 +8796,13 @@ def merge_dicts(*dicts): The inverse of this function is :func:`~torch.rfft`. +.. deprecated:: 1.7.0 + The function :func:`torch.irfft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.irfft` for one-sided input, or + :func:`torch.fft.ifft` for two-sided input. + .. warning:: Generally speaking, input to this function should contain values following conjugate symmetry. Note that even if :attr:`onesided` is From bea7901e387011248cf00e083af71dd92168c211 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 24 Sep 2020 08:20:06 -0700 Subject: [PATCH 086/449] Enable torch.tensor typechecks (#45077) Summary: this fixes https://github.com/pytorch/pytorch/issues/42983. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45077 Reviewed By: ezyang Differential Revision: D23842493 Pulled By: walterddr fbshipit-source-id: 1c516a5ff351743a187d00cba7ed0be11678edf1 --- mypy.ini | 3 -- tools/pyi/gen_pyi.py | 11 ++++---- torch/_C/__init__.pyi.in | 7 +++++ torch/tensor.py | 61 ++++++++++++++++++++++++---------------- torch/types.py | 16 +++++++++-- 5 files changed, 63 insertions(+), 35 deletions(-) diff --git a/mypy.ini b/mypy.ini index 07cdbc4dd6fa..a7c82cb69359 100644 --- a/mypy.ini +++ b/mypy.ini @@ -102,9 +102,6 @@ ignore_errors = True [mypy-torch.distributions.*] ignore_errors = True -[mypy-torch.tensor] -ignore_errors = True - [mypy-torch._tensor_str] ignore_errors = True diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 118a3e9b58b7..7079c6750223 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -74,11 +74,7 @@ # Somehow, these are defined in both _C and in functional. Ick! 
'broadcast_tensors', # Manually define named tensor type stubs in __init__.pyi.in - 'rename', - 'refine_names', - 'align_to', 'align_tensors', - 'unflatten', 'meshgrid', 'cartesian_prod', 'block_diag', @@ -87,7 +83,6 @@ 'stft', 'istft', 'tensordot', - 'norm', 'split', 'unique_consecutive', 'atleast_1d', @@ -536,6 +531,7 @@ def gen_pyi(declarations_path, out): 'def __init__(self, other: Tensor) -> None: ...', 'def __init__(self, size: {}, *, {}) -> None: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), ], + 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], # clamp has no default values in the Declarations 'clamp': ["def clamp(self, min: _float=-inf, max: _float=inf," " *, out: Optional[Tensor]=None) -> Tensor: ..."], @@ -546,6 +542,7 @@ def gen_pyi(declarations_path, out): 'tolist': ['def tolist(self) -> List: ...'], 'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'], 'element_size': ['def element_size(self) -> _int: ...'], + 'data_ptr': ['def data_ptr(self) -> _int: ...'], 'dim': ['def dim(self) -> _int: ...'], 'nonzero': ['def nonzero(self, *, as_tuple: _bool=...) -> Tensor: ...'], 'numel': ['def numel(self) -> _int: ...'], @@ -576,6 +573,10 @@ def gen_pyi(declarations_path, out): ], 'item': ["def item(self) -> Number: ..."], 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], + 'set_': ['def set_(self, storage: Storage, offset: _int, size: _size, stride: _size) -> Tensor: ...', + 'def set_(self, storage: Storage) -> Tensor: ...'], + 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', + 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], }) for binop in ['mul', 'div', 'true_divide', 'floor_divide']: for inplace in [False, True]: diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 41e0e887f829..2543e724b1e0 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -87,6 +87,9 @@ ${dtype_class_hints} class layout: ... +# Defined in torch/csrc/utils/disable_torch_function.cpp +def DisableTorchFunction(): ... + # Defined in torch/csrc/utils/tensor_layouts.cpp strided : layout = ... sparse_coo : layout = ... @@ -105,6 +108,10 @@ class qscheme: ... # Defined in torch/csrc/utils/tensor_qschemes.cpp per_tensor_affine: qscheme = ... +per_channel_affine: qscheme = ... +per_tensor_symmetric: qscheme = ... +per_channel_symmetric: qscheme = ... +per_channel_affine_float_qparams: qscheme = ... 
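# A minimal sketch (editorial, not part of the patch) of what the stubs above buy:
# with the qscheme constants declared, annotated helpers like this now pass mypy.
# Assumes a regular `import torch` at the top of the checked module.
def _is_per_channel(q: torch.qscheme) -> bool:
    return q in (torch.per_channel_affine, torch.per_channel_symmetric)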
# Defined in torch/csrc/autograd/python_function.cpp class _FunctionBase(object): diff --git a/torch/tensor.py b/torch/tensor.py index 18dccfda7c8b..3eadb4667e87 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -7,6 +7,7 @@ import warnings import weakref from torch._C import _add_docstr +from typing import Any, Dict, Tuple, Union from numbers import Number import functools from typing import Optional @@ -53,6 +54,8 @@ def __deepcopy__(self, memo): else: new_storage = self.storage().__deepcopy__(memo) if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[Tuple[torch.qscheme, float, int], Tuple[torch.qscheme, Tensor, Tensor, int]] if self.qscheme() == torch.per_tensor_affine: quantizer_params = self.qscheme(), self.q_scale(), self.q_zero_point() elif self.qscheme() in (torch.per_channel_affine, torch.per_channel_affine_float_qparams): @@ -85,6 +88,7 @@ def __reduce_ex__(self, proto): check_serializing_named_tensor(self) # See Note [Don't serialize hooks] torch.utils.hooks.warn_if_has_hooks(self) + backward_hooks: Dict[Any, Any] = OrderedDict() # Note: Numpy array is chosen to be the rebuild component for XLA Tensor. # We considered a few options: # 1. CPU tensor can't be used here. @@ -96,12 +100,14 @@ def __reduce_ex__(self, proto): # `tolist()` converts every single element in the tensor into python objects # and serialize them one by one. if self.device.type == 'xla': - args = (self.cpu().numpy(), - self.dtype, - str(self.device), - self.requires_grad) - return (torch._utils._rebuild_xla_tensor, args) + arg_xla = (self.cpu().numpy(), + self.dtype, + str(self.device), + self.requires_grad) + return (torch._utils._rebuild_xla_tensor, arg_xla) if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[Tuple[torch.qscheme, float, int], Tuple[Any, Tensor, Tensor, int]] if self.qscheme() == torch.per_tensor_affine: quantizer_params = (torch.per_tensor_affine, self.q_scale(), @@ -116,31 +122,31 @@ def __reduce_ex__(self, proto): self.q_per_channel_axis()) else: raise RuntimeError(f"Serialization is not supported for tensors of type {self.qscheme()}") - args = (self.storage(), - self.storage_offset(), - tuple(self.size()), - self.stride(), - quantizer_params, - self.requires_grad, - OrderedDict()) - return (torch._utils._rebuild_qtensor, args) + args_qtensor = (self.storage(), + self.storage_offset(), + tuple(self.size()), + self.stride(), + quantizer_params, + self.requires_grad, + backward_hooks) + return (torch._utils._rebuild_qtensor, args_qtensor) elif self.is_sparse: if self.layout == torch.sparse_coo: - args = (self.layout, - (self._indices(), - self._values(), - self.size())) + args_sparse = (self.layout, + (self._indices(), + self._values(), + self.size())) else: raise NotImplementedError( 'sparse tensor __reduce_ex__ for layout `%s`' % (self.layout)) - return (torch._utils._rebuild_sparse_tensor, args) + return (torch._utils._rebuild_sparse_tensor, args_sparse) else: args = (self.storage(), self.storage_offset(), tuple(self.size()), self.stride(), self.requires_grad, - OrderedDict()) # previously was self._backward_hooks + backward_hooks) # previously was self._backward_hooks return (torch._utils._rebuild_tensor_v2, args) def __setstate__(self, state): @@ -528,7 +534,7 @@ def __format__(self, format_spec): return self.item().__format__(format_spec) return object.__format__(self, format_spec) - def __ipow__(self, other): + def __ipow__(self, other): # type: 
ignore[misc] relevant_args = (self, other) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and type(other) is not Tensor and has_torch_function(relevant_args): @@ -652,7 +658,8 @@ def __contains__(self, element): if type(self) is not Tensor and has_torch_function(relevant_args): return handle_torch_function(Tensor.__contains__, relevant_args, self, element) if isinstance(element, (torch.Tensor, Number)): - return (element == self).any().item() + # type hint doesn't understand the __contains__ result array + return (element == self).any().item() # type: ignore[union-attr] raise RuntimeError( "Tensor.__contains__ only supports Tensor or scalar, but you passed in a %s." % @@ -669,7 +676,8 @@ def __cuda_array_interface__(self): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.__cuda_array_interface__.__get__, relevant_args, self) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.__cuda_array_interface__.__get__, relevant_args, self) # type: ignore[attr-defined] # raise AttributeError for unsupported tensors, so that # hasattr(cpu_tensor, "__cuda_array_interface__") is False. @@ -936,7 +944,8 @@ def grad(self): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.grad.__get__, relevant_args, self) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.grad.__get__, relevant_args, self) # type: ignore[attr-defined] if self.requires_grad and not hasattr(self, "retains_grad") and not self.is_leaf and self._grad is None: warnings.warn("The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad " @@ -951,7 +960,8 @@ def grad(self, new_grad): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.grad.__set__, relevant_args, self, new_grad) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.grad.__set__, relevant_args, self, new_grad) # type: ignore[attr-defined] self._grad = new_grad @grad.deleter @@ -959,7 +969,8 @@ def grad(self): relevant_args = (self,) from torch.overrides import has_torch_function, handle_torch_function if type(self) is not Tensor and has_torch_function(relevant_args): - return handle_torch_function(Tensor.grad.__delete__, relevant_args, self) + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.grad.__delete__, relevant_args, self) # type: ignore[attr-defined] del self._grad @classmethod diff --git a/torch/types.py b/torch/types.py index 0e386fc3e134..2aee8cd7ddde 100644 --- a/torch/types.py +++ b/torch/types.py @@ -34,13 +34,25 @@ class Storage(object): _cdata: int + def __deepcopy__(self, memo) -> 'Storage': + ... + + def _new_shared(self, int) -> 'Storage': + ... + def _write_file(self, f: Any, is_real_file: _bool, save_size: _bool) -> None: ... - def size(self) -> int: + def element_size(self) -> int: ... 
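# A minimal sketch (editorial, not part of the patch) of code the extended Storage
# stub lets mypy check; size() and element_size() are the methods declared above.
# Assumes `from torch.types import Storage` in the checked module.
def _storage_nbytes(s: 'Storage') -> int:
    return s.size() * s.element_size()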
- def _new_shared(self, int) -> 'Storage': + def is_shared(self) -> bool: + ... + + def share_memory_(self) -> 'Storage': + ... + + def size(self) -> int: ... ... From 71d1b5b0e227e407e60c0a3dd6a4caabdcd6c89a Mon Sep 17 00:00:00 2001 From: iurii zdebskyi <47012416+izdeby@users.noreply.github.com> Date: Thu, 24 Sep 2020 08:24:46 -0700 Subject: [PATCH 087/449] Add foreach APIs for binary ops with ScalarList (#44743) Summary: In this PR: 1) Added binary operations with ScalarLists. 2) Fixed _foreach_div(...) bug in native_functions 3) Covered all possible cases with scalars and scalar lists in tests 4) [minor] fixed bug in native_functions by adding "use_c10_dispatcher: full" to all _foreach functions tested via unit tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/44743 Reviewed By: bwasti, malfet Differential Revision: D23753711 Pulled By: izdeby fbshipit-source-id: bf3e8c54bc07867e8f6e82b5d3d35ff8e99b5a0a --- aten/src/ATen/native/ForeachOpsKernels.cpp | 24 + aten/src/ATen/native/ForeachUtils.h | 14 + .../native/cuda/ForeachBinaryOpScalarList.cu | 60 ++ aten/src/ATen/native/cuda/ForeachFunctors.cuh | 115 ++++ .../src/ATen/native/cuda/MultiTensorApply.cuh | 70 +++ aten/src/ATen/native/native_functions.yaml | 97 +++- .../check_backward_compatibility.py | 4 + test/test_foreach.py | 529 ++++++++++++++---- test/test_native_functions.py | 2 +- tools/autograd/gen_python_functions.py | 1 + .../templates/python_torch_functions.cpp | 1 + tools/codegen/model.py | 4 + tools/pyi/gen_pyi.py | 1 + torch/csrc/utils/python_arg_parser.cpp | 22 +- torch/csrc/utils/python_arg_parser.h | 18 +- 15 files changed, 843 insertions(+), 119 deletions(-) create mode 100644 aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 912b5116c4cc..73eb2070c07d 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -24,6 +24,26 @@ std::vector foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor return result; \ } +#define FOREACH_BINARY_OP_SCALARLIST(NAME) \ +void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + \ + for (int i = 0; i < tensors.size(); i++) { \ + tensors[i].NAME##_(scalars[i]); \ + } \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + std::vector result; \ + result.reserve(tensors.size()); \ + for (int i = 0; i < tensors.size(); i++) { \ + result.emplace_back(tensors[i].NAME(scalars[i])); \ + } \ + \ + return result; \ +} + #define FOREACH_BINARY_OP_LIST(NAME) \ std::vector foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \ check_foreach_api_restrictions(tensors1, tensors2); \ @@ -117,6 +137,10 @@ FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); +FOREACH_BINARY_OP_SCALARLIST(add); +FOREACH_BINARY_OP_SCALARLIST(sub); +FOREACH_BINARY_OP_SCALARLIST(mul); +FOREACH_BINARY_OP_SCALARLIST(div); FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 5a7aced74702..44e6a50297db 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -31,6 +31,12 @@ void 
check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) { } } +void check_foreach_api_restrictions(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); +} + // To go via 'fast' path, several conditions must be satisfied // - All tensors must be on the same device // - All tensors must have strided layout @@ -132,5 +138,13 @@ bool can_use_fast_route(TensorList tensors) { return true; } +bool can_use_fast_route(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); + + return can_use_fast_route(tensors); +} + } }} // at::native diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu new file mode 100644 index 000000000000..684f12732ffc --- /dev/null +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -0,0 +1,60 @@ +#include +#include +#include + +namespace at { namespace native { + +template class Op> +std::vector foreach_binary_op(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + std::vector vec_res; + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(vec_res); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { + multi_tensor_apply<2>(tensor_lists, scalars, BinaryOpScalarListFunctor()); + }); + return tensor_lists[1]; +} + +template class Op> +void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { + multi_tensor_apply<1>(tensor_lists, scalars, BinaryOpScalarListFunctor_()); + }); +} + +#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \ +void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \ + } \ + \ + foreach_binary_op_(tensors, scalars); \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ + } \ + \ + return foreach_binary_op(tensors, scalars); \ +} + +FOREACH_BINARY_OP_SCALARLIST(add, std::plus); +FOREACH_BINARY_OP_SCALARLIST(sub, std::minus); +FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies); +FOREACH_BINARY_OP_SCALARLIST(div, std::divides); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index a04d27110c9a..e83eca3dd8e1 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ 
b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -118,6 +118,121 @@ struct BinaryOpScalarFunctor { } }; +template class Op> +struct BinaryOpScalarListFunctor_ { + __device__ void operator() ( + int chunk_size, + TensorListScalarListMetadata<1>& tl) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + double y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } + // store + load_store(x, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + x[i] = r_x[ii]; + } + } + } + } +}; + +template class Op> +struct BinaryOpScalarListFunctor { + __device__ void operator() ( + int chunk_size, + TensorListScalarListMetadata<2>& tl) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + T* out = (T*)tl.addresses[1][tensor_loc]; + out += chunk_idx * chunk_size; + + double y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } + // store + load_store(out, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = Op()(static_cast(r_x[ii]), y); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + out[i] = r_x[ii]; + } + } + } + } +}; + template class Op> struct BinaryOpListAlphaFunctor_ { __device__ void operator() ( diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index f82a0d9a58c8..d162af19fd1b 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -26,6 +26,7 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s // 
TensorListMetadata has to be < 4KB - the limit for kernel launch argument static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; template struct TensorListMetadata { @@ -35,6 +36,15 @@ template struct TensorListMetadata int block_to_chunk[depth_to_max_blocks[n-1]]; }; +template struct TensorListScalarListMetadata +{ + void* addresses[n][depth_to_max_tensors_scalarlist[n-1]]; + int sizes[depth_to_max_tensors_scalarlist[n-1]]; + double scalar_vals[depth_to_max_tensors_scalarlist[n-1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n-1]]; + int block_to_chunk[depth_to_max_blocks[n-1]]; +}; + template C10_LAUNCH_BOUNDS_1(kBlockSize) __global__ void @@ -49,11 +59,71 @@ multi_tensor_apply_kernel( template void multi_tensor_apply( std::vector>& tensor_lists, + at::ArrayRef scalars, T callable, ArgTypes... args) { TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); + size_t n_tensors = tensor_lists[0].size(); + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for(size_t t = 0; t < n_tensors; t++) { + + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t]; + + tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + } + loc_tensor_info++; + + int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize; + for (int chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] && + chunk == chunks - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]); + bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1); + + if (tensors_full || blocks_full || last_chunk) { + multi_tensor_apply_kernel<<>>( + tensorListMeta, + callable, + args...); + + AT_CUDA_CHECK(cudaGetLastError()); + + // Reset. + loc_block_info = 0; + if(chunk == chunks - 1) { + loc_tensor_info = 0; + } + else { + tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; + tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1]; + for(int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1]; + } + loc_tensor_info = 1; + } + } + } + } + } + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... 
args) { + TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); + const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); size_t n_tensors = tensor_lists[0].size(); TensorListMetadata tensorListMeta; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f5bbb263ed9c..8068bc1721df 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6187,6 +6187,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6194,6 +6195,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6201,6 +6203,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6208,6 +6211,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6215,6 +6219,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6222,6 +6227,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6229,34 +6235,39 @@ CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ -- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ -- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6264,6 +6275,7 @@ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False 
variants: function dispatch: @@ -6271,13 +6283,15 @@ CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ -- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[] +- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6285,13 +6299,79 @@ CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ +- func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + +- func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + +- func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + +- func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + - func: _foreach_exp(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6299,6 +6379,7 @@ CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6306,6 +6387,7 @@ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6313,6 +6395,7 @@ CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full 
device_guard: False variants: function dispatch: @@ -6320,6 +6403,7 @@ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6327,6 +6411,7 @@ CUDA: foreach_tensor_addcdiv_cuda_ - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6334,6 +6419,7 @@ CUDA: foreach_tensor_addcmul_cuda_ - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6341,6 +6427,7 @@ CUDA: foreach_tensor_addcdiv_cuda - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 739a4de51951..4303fc563cfc 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -99,6 +99,10 @@ ("preprocess", datetime.date(2020, 10, 1)), ("compile", datetime.date(2020, 10, 1)), ("execute", datetime.date(2020, 10, 1)), + ("aten::_foreach_add", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub_", datetime.date(2020, 10, 1)), + ("aten::_foreach_div", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub", datetime.date(2020, 10, 1)), ] diff --git a/test/test_foreach.py b/test/test_foreach.py index 8369ba5b9be5..85d79096b2ad 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -4,21 +4,30 @@ from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, skipCUDAIfRocm class TestForeach(TestCase): - bin_ops = [ + foreach_bin_ops = [ torch._foreach_add, - torch._foreach_add_, torch._foreach_sub, - torch._foreach_sub_, torch._foreach_mul, - torch._foreach_mul_, torch._foreach_div, + ] + + foreach_bin_ops_ = [ + torch._foreach_add_, + torch._foreach_sub_, + torch._foreach_mul_, torch._foreach_div_, ] + torch_bin_ops = [ + torch.add, + torch.sub, + torch.mul, + torch.div, + ] + def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] - elif dtype in torch.testing.get_all_int_dtypes(): tensors = [torch.randint(1, 100, (N, N), device=device, dtype=dtype) for _ in range(N)] else: @@ -26,36 +35,39 @@ def _get_test_data(self, device, dtype, N): return tensors - def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] - res = foreach_op(tensors1, tensors2) - foreach_op_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - expected = [torch_op(tensors1[i]) for i in range(N)] - res = foreach_op(tensors1) - foreach_op_(tensors1) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, 
torch_op, N=20): - tensors = self._get_test_data(device, dtype, N) - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - value = 2 - - expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] - - res = foreach_op(tensors, tensors1, tensors2, value) - foreach_op_(tensors, tensors1, tensors2, value) - self.assertEqual(res, tensors) - self.assertEqual(tensors, expected) + def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] + res = foreach_op(tensors1, tensors2) + foreach_op_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, res) + + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + expected = [torch_op(tensors1[i]) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, expected) + + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors = self._get_test_data(device, dtype, N) + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + value = 2 + + expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] + + res = foreach_op(tensors, tensors1, tensors2, value) + foreach_op_(tensors, tensors1, tensors2, value) + self.assertEqual(res, tensors) + self.assertEqual(tensors, expected) def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): tensors1 = self._get_test_data(device, dtype, N) @@ -63,8 +75,8 @@ def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_ alpha = 2 expected = [torch_op(tensors1[i], torch.mul(tensors2[i], alpha)) for i in range(N)] - res = foreach_op(tensors1, tensors2, alpha) - foreach_op_(tensors1, tensors2, alpha) + res = foreach_op(tensors1, tensors2, alpha=alpha) + foreach_op_(tensors1, tensors2, alpha=alpha) self.assertEqual(res, tensors1) if dtype == torch.bool: @@ -88,7 +100,7 @@ def test_exp(self, device, dtype): @skipCUDAIfRocm @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_addcmul(self, device, dtype): - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcmul_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcmul, @@ -105,7 +117,7 @@ def test_addcdiv(self, device, dtype): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, torch._foreach_addcdiv_, torch.addcdiv) return - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcdiv_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, @@ -118,83 +130,372 @@ def test_addcdiv(self, device, dtype): # @dtypes(*torch.testing.get_all_dtypes()) def test_int_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - int_scalar = 1 - - # bool tensor + 1 will result in int64 tensor - if dtype == torch.bool: - expected = [torch.ones(10, 10, device=device, 
dtype=torch.int64) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, int_scalar) - self.assertEqual(res, expected) - - if dtype in [torch.bool]: - with self.assertRaisesRegex(RuntimeError, - "result type Long can't be cast to the desired output type Bool"): - torch._foreach_add_(tensors, int_scalar) - else: - torch._foreach_add_(tensors, int_scalar) - self.assertEqual(res, tensors) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + expected = [torch_bin_op(t, scalar) for t in tensors] + + res = foreach_bin_op(tensors, scalar) + + if dtype == torch.bool: + self.assertEqual(res, expected) + + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + + if foreach_bin_op_ == torch._foreach_div_ and dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, + "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + # TODO[type promotion]: Fix once type promotion is enabled. + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) + + # TODO[Fix scalar list]: + # We need to update codegen to correctly handle function overloads with float[] and int[]. + # As optimizers work with float tensors, the result will always be torch.float32 for now. + # Current schema is using 'float[]' as scalar list type. + @dtypes(*torch.testing.get_all_dtypes()) + def test_int_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1 for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types(): + if self.device_type == 'cpu': + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + # TODO[type promotion]: Fix once type promotion is enabled. 
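As a standalone illustration of the promotion rule the bool branch above relies on — a bool tensor plus an int scalar promotes to int64, so the out-of-place foreach op succeeds while the in-place variant must raise — here is a minimal sketch (the shapes and scalar value are illustrative, not taken from the test):

    import torch

    # Out-of-place: bool + int promotes, so the result list holds int64 tensors.
    bools = [torch.zeros(2, 2, dtype=torch.bool) for _ in range(3)]
    res = torch._foreach_add(bools, 1)
    assert res[0].dtype == torch.int64

    # In-place: the promoted result cannot be written back into bool storage, so
    # this is expected to raise ("... can't be cast to the desired output type").
    try:
        torch._foreach_add_(bools, 1)
    except RuntimeError as err:
        print(err)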
+ self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == 'cpu': + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_float_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - float_scalar = 1. + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3.3 + expected = [torch_bin_op(t, scalar) for t in tensors] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) - # float scalar + integral tensor will result in float tensor - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, float_scalar) - self.assertEqual(res, expected) - - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar)) - else: - torch._foreach_add_(tensors, float_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_float_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1.1 for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + # see TODO[Fix scalar list] + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + # see TODO[Fix scalar list] + self.assertEqual(res, [e.to(dtype) for e in expected]) + + 
foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, res) + return + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, expected) @dtypes(*torch.testing.get_all_dtypes()) def test_complex_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - complex_scalar = 3 + 5j - - # bool tensor + 1 will result in int64 tensor - expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype)) for _ in range(10)] - - if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0': - # value cannot be converted to dtype without overflow: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar)) - return - - res = torch._foreach_add(tensors, complex_scalar) - self.assertEqual(res, expected) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + 5j + expected = [torch_bin_op(t, scalar) for t in tensors] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + if dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=True) and \ + self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, expected) + + if dtype not in [torch.complex64, torch.complex128]: + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(res, tensors) - if dtype not in [torch.complex64, torch.complex128]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - else: - torch._foreach_add_(tensors, complex_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_complex_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [3 + 5j for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + with self.assertRaisesRegex(TypeError, "argument 
'scalars' must be tuple of floats"): + res = foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): + foreach_bin_op_(tensors, scalars) @dtypes(*torch.testing.get_all_dtypes()) def test_bool_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - bool_scalar = True + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = True + + if dtype == torch.bool: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + return + + if foreach_bin_op == torch._foreach_sub and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + res = foreach_bin_op(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + foreach_bin_op_(tensors, scalar) + elif foreach_bin_op == torch._foreach_sub and self.device_type == 'cuda': + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, foreach_bin_op(tensors, 1)) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + # TODO[type promotion]: Fix once type promotion is enabled. + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + if foreach_bin_op == torch._foreach_div and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, bool_scalar) - self.assertEqual(res, expected) - - torch._foreach_add_(tensors, bool_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_bool_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [True for _ in range(N)] + + if dtype == torch.bool: + if self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + return + else: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op(tensors, scalars) + else: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired"): + foreach_bin_op_(tensors, scalars) + + res = foreach_bin_op(tensors, scalars) + for r in res: + self.assertTrue(r.dtype == torch.float32) + else: + # we dont support 
bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes()) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + if foreach_bin_op == torch._foreach_sub: + if self.device_type == "cpu": + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(torch.float32) for r in foreach_bin_op(tensors, 1)]) + + with self.assertRaisesRegex(RuntimeError, "esult type Float can't be cast to the "): + foreach_bin_op_(tensors, scalars) + else: + self.assertEqual(res, foreach_bin_op(tensors, 1)) + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(dtype) for r in foreach_bin_op(tensors, 1)]) + else: + self.assertEqual(res, foreach_bin_op(tensors, 1)) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + if self.device_type == "cpu": + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + # see TODO[Fix scalar list] + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalars) + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, expected) + else: + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_add_with_different_size_tensors(self, device, dtype): @@ -248,9 +549,9 @@ def test_add_list_error_cases(self, device): # One empty list tensors1.append(torch.tensor([1], device=device)) - with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): + with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): torch._foreach_add(tensors1, tensors2) - with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): + with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): torch._foreach_add_(tensors1, tensors2) # Lists have different amount of tensors @@ -318,13 +619,25 @@ def test_div_list(self, device, dtype): self.skipTest("Skipped! 
See https://github.com/pytorch/pytorch/issues/44489") return - self._test_bin_op_list(device, dtype, torch._foreach_div, torch._foreach_div_, torch.div) + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + + if dtype in [torch.bfloat16, torch.bool, torch.float16]: + tensors2 = [torch.zeros(N, N, device=device, dtype=dtype).add(2) for _ in range(N)] + else: + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch.div(tensors1[i], tensors2[i]) for i in range(N)] + res = torch._foreach_div(tensors1, tensors2) + torch._foreach_div_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, res) def test_bin_op_list_error_cases(self, device): tensors1 = [] tensors2 = [] - for bin_op in self.bin_ops: + for bin_op in self.foreach_bin_ops + self.foreach_bin_ops_: # Empty lists with self.assertRaises(RuntimeError): bin_op(tensors1, tensors2) diff --git a/test/test_native_functions.py b/test/test_native_functions.py index 869c7aad47fb..e5afc79f037a 100644 --- a/test/test_native_functions.py +++ b/test/test_native_functions.py @@ -58,7 +58,7 @@ def fake_module(values, const): self.do_test_optional_floatlist_with_module(fake_module) def test_optional_floatlist_invalid(self): - with self.assertRaisesRegex(TypeError, "must be .* but found"): + with self.assertRaisesRegex(TypeError, "must be tuple of floats, not list"): FloatListWrapperModule()(torch.zeros(1), ["hi"]) with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"): diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 995dff38030b..8f272de9a5f6 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -281,6 +281,7 @@ def create_python_bindings(python_functions, is_python_method, module): 'c10::optional': 'toBoolOptional', 'c10::optional': 'toDoubleOptional', 'c10::optional>': 'doublelistOptional', + 'ArrayRef': 'doublelist', 'IntArrayRef': 'intlist', 'Scalar': 'scalar', 'ScalarType': 'scalartype', diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 62e9b8dd227f..673af99bce77 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -44,6 +44,7 @@ using at::Generator; using at::TensorList; using at::Dimname; using at::DimnameList; +using at::ArrayRef; using namespace torch::autograd::utils; diff --git a/tools/codegen/model.py b/tools/codegen/model.py index b0c470c91b6a..4ec0dc428b81 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -304,6 +304,10 @@ def __post_init__(self) -> None: # TODO: fixme if str(self.name) not in [ '_amp_non_finite_check_and_unscale_', + '_foreach_add_.ScalarList', + '_foreach_sub_.ScalarList', + '_foreach_mul_.ScalarList', + '_foreach_div_.ScalarList', '_foreach_add_.Scalar', '_foreach_sub_.Scalar', '_foreach_mul_.Scalar', diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 7079c6750223..d24966f9fb52 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -146,6 +146,7 @@ def type_to_python(typename, size=None): 'Dimname': 'Union[str, ellipsis, None]', 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', 'QScheme': '_qscheme', + 'ArrayRef' : 'Sequence[float]' }[typename] return typename diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index e954bef398e9..f9e26af63ada 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ 
b/torch/csrc/utils/python_arg_parser.cpp @@ -366,6 +366,23 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector return true; } +bool is_float_list(PyObject* obj) { + auto tuple = six::isTuple(obj); + if (!(tuple || PyList_Check(obj))) { + return false; + } + + auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); + if (size > 0) { + PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0); + if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) { + return false; + } + } + + return true; +} + // argnum is needed for raising the TypeError, it's used in the error message. auto FunctionParameter::check(PyObject* obj, std::vector &overloaded_args, int argnum) -> bool { @@ -420,7 +437,9 @@ auto FunctionParameter::check(PyObject* obj, std::vector &overloaded // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single int return size > 0 && THPUtils_checkLong(obj); } - case ParameterType::FLOAT_LIST: return (PyTuple_Check(obj) || PyList_Check(obj)); + case ParameterType::FLOAT_LIST: { + return is_float_list(obj); + } case ParameterType::GENERATOR: return THPGenerator_Check(obj); case ParameterType::BOOL: return PyBool_Check(obj); case ParameterType::STORAGE: return isStorage(obj); @@ -901,6 +920,7 @@ PythonArgs PythonArgParser::raw_parse(PyObject* self, PyObject* args, PyObject* print_error(self, args, kwargs, parsed_args); } + void PythonArgParser::print_error(PyObject* self, PyObject* args, PyObject* kwargs, PyObject* parsed_args[]) { // NOLINT auto num_args = PyTuple_GET_SIZE(args) + (kwargs ? PyDict_Size(kwargs) : 0); std::vector plausible_idxs; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 78efb6cf2db3..d0e2bdc074ff 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -173,6 +173,8 @@ struct PythonArgs { inline c10::optional toBoolOptional(int i); inline c10::optional toDoubleOptional(int i); inline c10::OptionalArray doublelistOptional(int i); + inline std::vector doublelist(int i); + inline std::vector getDoublelist(int i); inline at::Layout layout(int i); inline at::Layout layoutWithDefault(int i, at::Layout default_layout); inline c10::optional layoutOptional(int i); @@ -369,10 +371,7 @@ inline c10::OptionalArray PythonArgs::intlistOptional(int i) { return intlist(i); } -inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { - if (!args[i]) { - return {}; - } +inline std::vector PythonArgs::getDoublelist(int i) { PyObject* arg = args[i]; auto tuple = PyTuple_Check(arg); auto size = tuple ? 
PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); @@ -390,6 +389,17 @@ inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { return res; } +inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { + if (!args[i]) { + return {}; + } + return this->getDoublelist(i); +} + +inline std::vector PythonArgs::doublelist(int i) { + return this->getDoublelist(i); +} + inline at::ScalarType PythonArgs::scalartypeWithDefault(int i, at::ScalarType default_scalartype) { if (!args[i]) return default_scalartype; return scalartype(i); From bc591d76a10c79f179d0bea016e59096add511a3 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 24 Sep 2020 08:35:46 -0700 Subject: [PATCH 088/449] add skip_if_rocm to all requires_nccl tests (#45158) Summary: requires_nccl annotation should skip_if_rocm as well Pull Request resolved: https://github.com/pytorch/pytorch/pull/45158 Reviewed By: seemethere Differential Revision: D23879952 Pulled By: walterddr fbshipit-source-id: 818fb31ab75d5f02e77fe3f1367faf748855bee7 --- .../ddp_comm_hooks/test_ddp_hooks.py | 5 ++++ test/distributed/test_c10d.py | 24 +++++++++++++++++++ .../ddp_under_dist_autograd_test.py | 2 ++ 3 files changed, 31 insertions(+) diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 2b3d43814c0f..37c8f14af853 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -14,6 +14,7 @@ MultiProcessTestCase, requires_nccl, skip_if_lt_x_gpu, + skip_if_rocm, ) from torch.testing._internal.common_utils import run_tests @@ -97,6 +98,7 @@ def _run_and_get_grads(self, model): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_hook(self): """ This unit test verifies the ``allreduce`` hook registered case gives same result @@ -114,6 +116,7 @@ def test_ddp_comm_hook_allreduce_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_fp16compress_hook(self): """ This unit test verifies the ``fp16 compress`` hook registered case @@ -131,6 +134,7 @@ def test_ddp_comm_hook_fp16compress_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_tensor_hook(self): """ This unit test verifies the ``quantize per tensor`` hook registered case @@ -148,6 +152,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_channel_hook(self): """ This unit test verifies the ``quantize per channel`` hook registered case diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index cfd0930284a5..b2b7d186713c 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1559,6 +1559,7 @@ def create(num, prefix): TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", ) +@skip_if_rocm class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -2123,6 +2124,7 @@ def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): @requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_nccl_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2130,6 +2132,7 @@ def test_nccl_backend_2gpu_module(self): @requires_nccl() @skip_if_lt_x_gpu(8) + @skip_if_rocm def test_nccl_backend_4gpu_module(self): int_devices = 
gpus_for_rank(self.world_size)[self.rank][:4] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2137,6 +2140,7 @@ def test_nccl_backend_4gpu_module(self): @requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_ddp_multi_device_module_config(self): gpus = gpus_for_rank(self.world_size)[self.rank] @@ -2167,6 +2171,7 @@ def test_ddp_multi_device_module_config(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_fp16(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2198,6 +2203,7 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_arbitrary_forward_return_value(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2482,6 +2488,7 @@ def run_and_verify_grad(model): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_multiple_outputs_multiple_backward(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2532,6 +2539,7 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_no_grad(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2643,6 +2651,7 @@ def step_model(model, input, target): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync(self): """ Runs _test_accumulate_gradients_no_sync using default inputs @@ -2651,6 +2660,7 @@ def test_accumulate_gradients_no_sync(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync @@ -2670,6 +2680,7 @@ def allreduce_hook( @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce @@ -2699,6 +2710,7 @@ def div(fut): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_module(self): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying @@ -2840,6 +2852,7 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_failure_recovery(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3161,6 +3174,7 @@ def test_ddp_comm_hook_future_passing_gpu_gloo(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_future_passing_gpu_nccl(self): """ This unit test verifies whether the Future object is passed properly using nccl backend. 
@@ -3178,6 +3192,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_hook_nccl(self): """ This unit test verifies whether a DDP communication hook that just calls @@ -3200,6 +3215,7 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self): """ This unit test verifies whether a DDP communication hook that calls allreduce and then @@ -3591,6 +3607,7 @@ def _run_all_reduce(self, pg): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_nonblocking(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3642,36 +3659,42 @@ def _test_nccl_errors_blocking(self, func): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_clean_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(0)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_nonzero_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(1)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_abort(self): self._test_nccl_errors_blocking(lambda: os.abort()) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigkill(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigterm(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_blocking_wait_with_barrier(self): os.environ["NCCL_BLOCKING_WAIT"] = "1" store = c10d.FileStore(self.file_name, self.world_size) @@ -3694,6 +3717,7 @@ def _run_invalid_nccl_blocking_wait_env(self, val): @requires_nccl() @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_invalid_nccl_blocking_wait_env(self): self._run_invalid_nccl_blocking_wait_env('abc') self._run_invalid_nccl_blocking_wait_env('-1') diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 75d89c33325a..1b1f755ed4cc 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -18,6 +18,7 @@ requires_gloo, requires_nccl, skip_if_lt_x_gpu, + skip_if_rocm, ) from torch.testing._internal.dist_utils import dist_init from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( @@ -641,6 +642,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): @skip_if_lt_x_gpu(NUM_TRAINERS) @requires_nccl() @dist_init + @skip_if_rocm def test_ddp_dist_autograd_local_vs_remote_gpu(self): # Each trainer uses a different random seed. 
Otherwise, they are going # to have exactly the same initial model parameters, input, and From f9ae296a85c9e3835cd8664d18fea9282c205e58 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Thu, 24 Sep 2020 08:42:44 -0700 Subject: [PATCH 089/449] renaming TestDdpCommHook class so it doesn't get picked up as a test by pytest (#44905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44905 Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D23825308 Pulled By: bdhirsh fbshipit-source-id: 17a07b3bd211850d6ecca793fd9ef3f326ca9274 --- test/distributed/test_c10d.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index b2b7d186713c..d9faee9197a0 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1937,7 +1937,7 @@ def forward(self, x): return self.p + x -class TestDdpCommHook(nn.Module): +class ModuleForDdpCommHook(nn.Module): def __init__(self): super().__init__() self.t0 = Task() @@ -3110,7 +3110,7 @@ def test_ddp_comm_hook_future_passing_cpu(self): # Test on CPU cpu_model = DistributedDataParallel( - TestDdpCommHook().cpu(), process_group=process_group + ModuleForDdpCommHook().cpu(), process_group=process_group ) # Register DDP Communication Hook @@ -3123,7 +3123,7 @@ def test_ddp_comm_hook_future_passing_cpu(self): def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( - TestDdpCommHook().to(device_id), + ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, ) @@ -3259,7 +3259,7 @@ def test_ddp_invalid_comm_hook_init(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex(TypeError, "Communication hook must be callable."): model._register_comm_hook(state=None, hook=1) @@ -3283,7 +3283,7 @@ def test_ddp_invalid_comm_hook_return_type(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex( ValueError, @@ -3320,7 +3320,7 @@ def test_ddp_comm_hook_register_just_once(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) def dummy_hook(state, bucket): fut = torch.futures.Future() From 5195d727b57c19f1d5e201338a062f4d1d0636c1 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Thu, 24 Sep 2020 09:14:00 -0700 Subject: [PATCH 090/449] adding a test for ddp save()/load() (#44906) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44906 Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D23825386 Pulled By: bdhirsh fbshipit-source-id: 2276e6e030ef9cffd78fc78c2ffe34d60a1e160e --- test/distributed/test_c10d.py | 86 +++++++++++++++++++++++++++++++++++ 1 file changed, 86 
insertions(+) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index d9faee9197a0..64e255fce3e6 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -2916,6 +2916,92 @@ def forward(self, x): loss = criterion(output, target) loss.backward() + @requires_nccl() + @skip_if_not_multigpu + def test_save_load_checkpoint(self): + dist.init_process_group( + "gloo", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank + ) + + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + def train_loop(model, optimizer, iterations): + for _ in range(iterations): + optimizer.zero_grad() + output = model(input) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + + model_withload = TestModel().float().to(device_id) + model_withoutload = TestModel().float().to(device_id) + + ddp_withload = DistributedDataParallel( + model_withload, + device_ids=[device_id], + ) + ddp_withoutload = DistributedDataParallel( + model_withoutload, + device_ids=[device_id], + ) + + # ensure that both models start with the same set of parameters. By default they are randomized on construction + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + for p in ddp_withoutload.parameters(): + with torch.no_grad(): + p.zero_() + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + + optimizer_withload = torch.optim.SGD(ddp_withload.parameters(), lr=0.001) + optimizer_withoutload = torch.optim.SGD(ddp_withoutload.parameters(), lr=0.001) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) + + # run the model for 6 iterations, with a checkpoint in the middle + train_loop(ddp_withload, optimizer_withload, 3) + + # zero out parameters and reload them from the state dict + checkpoint_path = tempfile.gettempdir() + "/model.checkpoint" + if self.rank == 0: + torch.save(ddp_withload.state_dict(), checkpoint_path) + + dist.barrier() + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + map_location = {'cuda:%d' % 0: 'cuda:%d' % self.rank} + ddp_withload.load_state_dict( + torch.load(checkpoint_path, map_location=map_location)) + + train_loop(ddp_withload, optimizer_withload, 3) + + # re-run the model with the same inputs for 6 iterations with no checkpoint + train_loop(ddp_withoutload, optimizer_withoutload, 6) + + for p_withload, p_withoutload in zip(ddp_withload.parameters(), ddp_withoutload.parameters()): + self.assertEqual(p_withload, p_withoutload) + + def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): mult = 2 batch_size = mult * self.world_size From bfdf4323ac3bbd3dcdf4b7ab216347770ebaa3bb Mon Sep 17 00:00:00 2001 From: Shen Li Date: Thu, 24 Sep 2020 09:31:46 -0700 Subject: [PATCH 091/449] Bump up NCCL to 2.7.8 (#45251) Summary: Use latest NCCL Pull Request resolved: https://github.com/pytorch/pytorch/pull/45251 Reviewed By: mingzhe09088 Differential Revision: D23893064 Pulled By: mrshenli fbshipit-source-id: 820dd166039e61a5aa59b4c5bbc615a7b18be8c3 --- third_party/nccl/nccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nccl/nccl 
b/third_party/nccl/nccl index 195232556936..033d799524fb 160000 --- a/third_party/nccl/nccl +++ b/third_party/nccl/nccl @@ -1 +1 @@ -Subproject commit 195232556936b39b01cc908296e1650b80d4a3e9 +Subproject commit 033d799524fb97629af5ac2f609de367472b2696 From 8507ea22b21842f93a7d17ddfe737f134642375c Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Thu, 24 Sep 2020 09:36:46 -0700 Subject: [PATCH 092/449] replace timer test with a mocked variant (#45173) Summary: I noticed that the recently introduced adaptive_autorange tests occasionally timeout CI, and I've been meaning to improve the Timer tests for a while. This PR allows unit tests to swap the measurement portion of `Timer` with a deterministic mock so we can thoroughly test behavior without having to worry about flaky CI measurements. It also means that the tests can be much more detailed and still finish very quickly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45173 Test Plan: You're lookin' at it. Reviewed By: ezyang Differential Revision: D23873548 Pulled By: robieta fbshipit-source-id: 26113e5cea0cbf46909b9bf5e90c878c29e87e88 --- test/test_utils.py | 152 +++++++++++++++++++++++--- torch/utils/_benchmark/utils/timer.py | 4 +- 2 files changed, 141 insertions(+), 15 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index bf002541bebf..398a10971d0d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import shutil import random import tempfile +import textwrap import unittest import torch import torch.nn as nn @@ -16,6 +17,7 @@ from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS from urllib.error import URLError +import numpy as np # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -621,25 +623,147 @@ def test_timer(self): timer = benchmark_utils.Timer( stmt="torch.ones(())", ) - median = timer.blocked_autorange(min_run_time=0.1).median + median = timer.blocked_autorange(min_run_time=0.01).median self.assertIsInstance(median, float) + # We set a very high threshold to avoid flakiness in CI. + # The internal algorithm is tested in `test_adaptive_timer` + median = timer.adaptive_autorange(threshold=0.5).median + + class _MockTimer: + _seed = 0 + + _timer_noise_level = 0.05 + _timer_cost = 100e-9 # 100 ns + + _function_noise_level = 0.05 + _function_costs = ( + ("pass", 8e-9), + ("cheap_fn()", 4e-6), + ("expensive_fn()", 20e-6), + ) + + def __init__(self, stmt, setup, timer, globals): + self._random_state = np.random.RandomState(seed=self._seed) + self._mean_cost = {k: v for k, v in self._function_costs}[stmt] + + def sample(self, mean, noise_level): + return max(self._random_state.normal(mean, mean * noise_level), 5e-9) + + def timeit(self, number): + return sum([ + # First timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + + # Stmt body + self.sample(self._mean_cost * number, self._function_noise_level), + + # Second timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + ]) + def test_adaptive_timer(self): - # Validate both on different sizes validate against blocked_autorange - # This looks for relative differences btetween orders of magnitude to - # provide a stable/portable test which is somewhat informative. 
- timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((10,10)))", + class MockTimer(benchmark_utils.Timer): + _timer_cls = self._MockTimer + + def assert_reprs_match(measurement, expected): + measurement_repr = re.sub( + "object at 0x[0-9a-fA-F]+>", + "object at 0xXXXXXXXXXXXX>", + repr(measurement) + ) + self.assertEqual(measurement_repr, textwrap.dedent(expected).strip()) + + assert_reprs_match( + MockTimer("pass").blocked_autorange(min_run_time=10), + """ + + pass + Median: 7.98 ns + IQR: 0.52 ns (7.74 to 8.26) + 125 measurements, 10000000 runs per measurement, 1 thread""" ) - small = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((500,500)))", + + assert_reprs_match( + MockTimer("pass").adaptive_autorange(), + """ + + pass + Median: 7.86 ns + IQR: 0.71 ns (7.63 to 8.34) + 6 measurements, 1000000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").blocked_autorange(min_run_time=10), + """ + + cheap_fn() + Median: 3.98 us + IQR: 0.27 us (3.85 to 4.12) + 252 measurements, 10000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").adaptive_autorange(), + """ + + cheap_fn() + Median: 4.16 us + IQR: 0.22 us (4.04 to 4.26) + 4 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").blocked_autorange(min_run_time=10), + """ + + expensive_fn() + Median: 19.97 us + IQR: 1.35 us (19.31 to 20.65) + 501 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").adaptive_autorange(), + """ + + expensive_fn() + Median: 20.79 us + IQR: 1.09 us (20.20 to 21.29) + 4 measurements, 1000 runs per measurement, 1 thread""" ) - medium = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - blocked_medium = timer.blocked_autorange(min_run_time=0.1) - self.assertLess(small.median, medium.median) - # This acts as a control to compare to a different way to measure the same value. - self.assertLess(small.median, blocked_medium.median) + + class _MockCudaTimer(self._MockTimer): + # torch.cuda.synchronize is much more expensive than + # just timeit.default_timer + _timer_cost = 10e-6 + + _function_costs = ( + self._MockTimer._function_costs[0], + self._MockTimer._function_costs[1], + + # GPU should be faster once there is enough work. 
+ ("expensive_fn()", 5e-6), + ) + + class MockCudaTimer(benchmark_utils.Timer): + _timer_cls = _MockCudaTimer + + configurations = ( + (7.9903966e-09, 376, 1000000, MockTimer("pass")), + (7.8554826e-09, 4, 100000000, MockCudaTimer("pass")), + (3.9930536e-06, 752, 1000, MockTimer("cheap_fn()")), + (3.9441239e-06, 8, 100000, MockCudaTimer("cheap_fn()")), + (1.9994249e-05, 150, 1000, MockTimer("expensive_fn()")), + (4.9301076e-06, 6, 100000, MockCudaTimer("expensive_fn()")), + ) + + for median, repeats, number_per_run, timer_instance in configurations: + measurement = timer_instance.blocked_autorange(min_run_time=3) + self.assertEqual(measurement.median, median) + self.assertEqual(len(measurement.times), repeats) + self.assertEqual(measurement.number_per_run, number_per_run) def test_compare(self): compare = benchmark_utils.Compare([ diff --git a/torch/utils/_benchmark/utils/timer.py b/torch/utils/_benchmark/utils/timer.py index 00260b49f99f..c78db2740c2f 100644 --- a/torch/utils/_benchmark/utils/timer.py +++ b/torch/utils/_benchmark/utils/timer.py @@ -20,6 +20,8 @@ def timer(): class Timer(object): + _timer_cls = timeit.Timer + def __init__( self, stmt="pass", @@ -47,7 +49,7 @@ def __init__( self._description = description self._env = env self._num_threads = num_threads - self._timer = timeit.Timer(stmt=stmt, setup=setup, timer=timer, globals=globals) + self._timer = self._timer_cls(stmt=stmt, setup=setup, timer=timer, globals=globals) def _construct_measurement(self, number_per_run: int, times: List[float]): return common.Measurement( From 2b38c09f69ace58058ace6d1b3b45725c1281fca Mon Sep 17 00:00:00 2001 From: Raziel Alvarez Guevara Date: Thu, 24 Sep 2020 09:36:53 -0700 Subject: [PATCH 093/449] Moves prim ops from C10 back to JIT (#45144) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45144 Moves prim ops from C10 back to JIT. 
These were originally moved to C10 from JIT in D19237648 (https://github.com/pytorch/pytorch/commit/f362cd510dcedbf7384d418aad60e0ba963baeb6) ghstack-source-id: 112775781 Test Plan: buck test //caffe2/test/cpp/jit:jit https://pxl.cl/1l22N buck test adsatlas/gavel/lib/ata_processor/tests:ata_processor_test https://pxl.cl/1lBxD Reviewed By: iseeyuan Differential Revision: D23697598 fbshipit-source-id: 36d1eb8c346e9b161ba6af537a218440a9bafd27 --- aten/src/ATen/templates/TypeDefault.cpp | 8 --- test/cpp/jit/test_lite_interpreter.cpp | 26 +++++++++ test/cpp/jit/tests.h | 1 + tools/build_variables.bzl | 8 +-- torch/csrc/jit/runtime/register_prim_ops.cpp | 53 +++++++++++++++++++ .../jit/runtime/register_prim_ops_c10.cpp | 40 -------------- 6 files changed, 82 insertions(+), 54 deletions(-) delete mode 100644 torch/csrc/jit/runtime/register_prim_ops_c10.cpp diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index c1e7c9ac0c64..58c80381d340 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -59,14 +59,6 @@ TORCH_LIBRARY(aten, m) { m.def(TORCH_SELECTIVE_SCHEMA("aten::rsplit(str self, str separator=' ', int max=-1) -> str[]")); m.def(TORCH_SELECTIVE_SCHEMA("aten::join(str self, str[] values) -> str")); - // Integer Ops - // Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cp - m.def("Int.Tensor(Tensor a) -> int"); - m.def("Int.bool(bool a) -> int"); - m.def("Int.float(float a) -> int"); - m.def("Int.Scalar(Scalar a) -> int"); - m.def("Int.str(str a) -> int"); - // Distributed Ops // Implementations located in torch/csrc/jit/runtime/register_distributed_ops.cpp m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index d09048413aec..814654dfc697 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -204,6 +204,32 @@ void testLiteInterpreterPrim() { AT_ASSERT(resi == refi); } +void testLiteInterpreterPrimScalar() { + Module m("m"); + m.define(R"JIT( + def forward(self, x): + return int(x.item()) + )JIT"); + + std::vector inputs; + auto minput = 3.5 * torch::ones({}); + inputs.emplace_back(minput); + auto ref = m.run_method("forward", minput); + + std::stringstream ss; + m._save_for_mobile(ss); + mobile::Module bc = _load_for_mobile(ss); + IValue res; + for (int i = 0; i < 3; ++i) { + auto bcinputs = inputs; + res = bc.get_method("forward")(bcinputs); + } + + auto resi = res.toInt(); + auto refi = ref.toInt(); + AT_ASSERT(resi == refi); +} + void testLiteInterpreterLoadOrigJit() { Module m("m"); m.register_parameter("foo", torch::ones({}), false); diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 186aaaec2bba..0285559fb8fc 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -71,6 +71,7 @@ namespace jit { _(MobileTypeParser) \ _(LiteInterpreterBuiltinFunction) \ _(LiteInterpreterPrim) \ + _(LiteInterpreterPrimScalar) \ _(LiteInterpreterLoadOrigJit) \ _(LiteInterpreterWrongMethodName) \ _(LiteInterpreterParams) \ diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index b1a2967f5dea..174bb858da44 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -91,11 +91,7 @@ core_sources_common = [ "torch/csrc/jit/serialization/unpickler.cpp", ] -jit_sources_common = [ - "torch/csrc/jit/runtime/register_prim_ops_c10.cpp", -] - -libtorch_sources_common = core_sources_common + jit_sources_common 
+libtorch_sources_common = core_sources_common core_trainer_sources = [ "torch/csrc/autograd/anomaly_mode.cpp", @@ -306,7 +302,7 @@ jit_sources_full = [ "torch/csrc/jit/passes/utils/check_alias_annotation.cpp", ] -libtorch_core_jit_sources = sorted(jit_sources_common + jit_sources_full) +libtorch_core_jit_sources = sorted(jit_sources_full) libtorch_cmake_sources = libtorch_core_sources + libtorch_core_jit_sources diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 98f328a43240..bf2ffa421ee9 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -188,6 +188,59 @@ RegisterOperators reg( push(stack, (bool)d); }, aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.Tensor(Tensor a) -> int"), + [](Stack* stack) { + at::Tensor a; + pop(stack, a); + push(stack, a.item()); + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.bool(bool a) -> int"), + [](Stack* stack) { + bool b; + pop(stack, b); + push(stack, static_cast(b)); + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.float(float a) -> int"), + [](Stack* stack) { + double d; + pop(stack, d); + push(stack, static_cast(d)); + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.Scalar(Scalar a) -> int"), + [](Stack* stack) { + IValue scalar; + pop(stack, scalar); + if (scalar.isInt()) { + push(stack, std::move(scalar)); + } else { + // toScalar() needed to avoid strict type check in IValue::toInt. + push(stack, static_cast(scalar.toScalar().toInt())); + } + }, + aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA("aten::Int.str(str a) -> int"), + [](Stack* stack) { + auto s = pop(stack).toString(); + std::string::size_type sz; + int64_t val = static_cast(c10::stoll(s->string(), &sz)); + if (sz == s->string().size()) { + push(stack, val); + } else { + std::stringstream error_str; + error_str << "invalid literal for int() " + << "with base 10: '" << s->string() << "'"; + throw std::runtime_error(error_str.str()); + } + }, + aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA("aten::Float.Tensor(Tensor a) -> float"), [](Stack* stack) { diff --git a/torch/csrc/jit/runtime/register_prim_ops_c10.cpp b/torch/csrc/jit/runtime/register_prim_ops_c10.cpp deleted file mode 100644 index b9e4e23c77b0..000000000000 --- a/torch/csrc/jit/runtime/register_prim_ops_c10.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include -#include -#include - -using Stack = std::vector; -using at::Scalar; -using at::Tensor; -using c10::IValue; -using torch::jit::drop; -using torch::jit::pack; -using torch::jit::peek; -using torch::jit::pop; -using torch::jit::push; - -// Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cpp -TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("Int.Tensor", [](at::Tensor a) { return a.item(); }); - - m.impl("Int.bool", [](bool b) { return static_cast(b); }); - - m.impl("Int.float", [](double d) { return static_cast(d); }); - - m.impl("Int.Scalar", [](Scalar scalar) { - return static_cast(scalar.toInt()); - }); - - m.impl("Int.str", [](const std::string& str) { - std::string::size_type sz; - int64_t val = static_cast(c10::stoll(str, &sz)); - if (sz != str.size()) { - std::stringstream error_str; - error_str << "invalid literal for int() " - << "with base 10: '" << str << "'"; - throw 
std::runtime_error(error_str.str()); - } - return val; - }); -} From e57a08119bc01b7d06ee6ba8042cc0885ebb6276 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 24 Sep 2020 09:48:51 -0700 Subject: [PATCH 094/449] Add a warning log when there is high skew of uneven inputs in DDP training (#45238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45238 Adds a warning when there is much higher than expected amount of discrepancy of inputs across different processes when running with uneven inputs. This is because a skew in the thousands can reduce performance a nontrivial amount as shown in benchmarks, and it was proposed to add this warning as a result. Tested by running the tests so the threshold is hit and observing the output. ghstack-source-id: 112773552 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23719270 fbshipit-source-id: 306264f62c1de65e733696a912bdb6e9376d5622 --- torch/nn/parallel/distributed.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 44f5e6fe2ccb..790a9d1c2fc4 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -4,6 +4,7 @@ import os import inspect import logging +import warnings import torch @@ -431,7 +432,6 @@ def model_parameters(m): if self.device_ids and len(self.device_ids) > 1: - import warnings warnings.warn( "Single-Process Multi-GPU is not the recommended mode for " "DDP. In this mode, each DDP instance operates on multiple " @@ -815,8 +815,20 @@ def join(self, divide_by_initial_world_size=True, enable=True): if enable and not has_error: all_procs_joined = False is_last_joiner = True - # Schedules allreduce to match fwd pass allreduce in non-joined procs + i = 0 + WARN_THRESHOLD = 1000 + warnings.simplefilter("once") while not all_procs_joined: + if i > WARN_THRESHOLD: + my_rank = dist.get_rank(self.process_group) + warnings.warn( + "Detected uneven input skew of greater " + f"than {WARN_THRESHOLD}. This means that rank {my_rank} " + f"has at least {WARN_THRESHOLD} fewer inputs than " + "other currently active ranks. This level of skew could " + "lead to performance degradation during training." + ) + # Schedules allreduce to match fwd pass allreduce in non-joined procs num_active_procs = self._schedule_shadow_all_reduce_for_fwd_pass() if num_active_procs == 0: all_procs_joined = True @@ -853,6 +865,7 @@ def join(self, divide_by_initial_world_size=True, enable=True): self._match_unused_params_allreduce() # It will push rebuilt params only once during training period self.reducer._push_all_rebuilt_params() + i += 1 # All procs joined. Agree on authoritative rank and broadcast the model. 
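A usage sketch of the join() context this hunk touches (names such as ddp, loader and opt are illustrative; this is not code from the patch): ranks that exhaust their data keep shadowing the collectives of still-active ranks, and with this change a rank that idles for more than WARN_THRESHOLD (1000) iterations emits the new skew warning.

    # Assumes a DistributedDataParallel instance `ddp`, an optimizer `opt`, and a
    # per-rank iterable `loader` whose length may differ across ranks.
    with ddp.join():
        for batch in loader:
            opt.zero_grad()
            loss = ddp(batch).sum()
            loss.backward()
            opt.step()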
self._sync_final_model(is_last_joiner) From b8eab8cdbdc467bc6ef19381af9387f41e45fb44 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 24 Sep 2020 10:04:47 -0700 Subject: [PATCH 095/449] [hotfix] typo in NaiveConvolutionTranspose2d.cu (#45224) Summary: Fixes typo in e2f49c8 Fixes https://github.com/pytorch/pytorch/issues/45172 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45224 Reviewed By: ezyang Differential Revision: D23879872 Pulled By: walterddr fbshipit-source-id: c3db6d4c6f2ac0e6887862d4217a79c030647cb9 --- .../cuda/NaiveConvolutionTranspose2d.cu | 5 +- test/test_nn.py | 57 +++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index 10138f4bced0..13149759926d 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -684,10 +684,7 @@ void slow_conv_transpose2d_acc_grad_parameters_cuda_template( // Matrix mulitply per output: input_n = input.select(0, elt); - if (kernel_height == 1 && kernel_width == 1) { - // for 1x1 column skip im2col step - columns.copy_(grad_output_n); - } else { + if (kernel_height != 1 || kernel_width != 1) { // Extract columns: im2col( at::cuda::getCurrentCUDAStream(), diff --git a/test/test_nn.py b/test/test_nn.py index 281425e26782..9618b70ab71c 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10698,6 +10698,63 @@ def test_contig_wrong_stride_cudnn(self, device): F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device)) F.conv2d(x, torch.randn(1, 16, 1, 1, device=device)) + @onlyCUDA + def test_Conv2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.Conv2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose3d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose3d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with 
cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose3d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + def _ordered_sequence(self, device, dtype): """Create ordered list of random sequences""" seqs = [torch.empty(random.randint(1, 6), device=device, dtype=dtype) From 3f5eee666cfbb5cbc5ced32915658e00b39b40e9 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 24 Sep 2020 10:23:46 -0700 Subject: [PATCH 096/449] Adjust TF32 tests (#44240) Summary: - The thresholds of some tests are bumped up. Depending on the random generator, sometimes these tests fail with things like 0.0059 is not smaller than 0.005. I ran `test_nn.py` and `test_torch.py` for 10+ times to check these are no longer flaky. - Add `tf32_on_and_off` to new `matrix_exp` tests. - Disable TF32 on test suites other than `test_nn.py` and `test_torch.py` cc: ptrblck Pull Request resolved: https://github.com/pytorch/pytorch/pull/44240 Reviewed By: mruberry Differential Revision: D23882498 Pulled By: ngimel fbshipit-source-id: 44a9ec08802c93a2efaf4e01d7487222478b6df8 --- aten/src/ATen/Context.cpp | 23 ++++++++++++++++++ aten/src/ATen/Context.h | 16 +++++++++++++ aten/src/ATen/cuda/CUDABlas.cpp | 32 +++++++++++++++---------- aten/src/ATen/cuda/CublasHandlePool.cpp | 2 +- aten/src/ATen/native/LinearAlgebra.cpp | 3 +++ test/jit/test_tracer.py | 4 ++++ test/test_jit_fuser.py | 7 ++++++ test/test_nn.py | 1 + test/test_torch.py | 28 +++++++++++++--------- torch/testing/_internal/common_cuda.py | 15 ++++++++++++ torch/testing/_internal/common_nn.py | 21 +++++++++++++--- 11 files changed, 124 insertions(+), 28 deletions(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 1496b6ee551d..18673877c219 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -230,4 +230,27 @@ Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } +// override_allow_tf32_flag = true +// means the allow_tf32 flags are overrided and tf32 is force disabled +// override_allow_tf32_flag = false +// means the original allow_tf32 flags are followed +thread_local bool override_allow_tf32_flag = false; + +NoTF32Guard::NoTF32Guard() { + if (!override_allow_tf32_flag) { + changed = true; + override_allow_tf32_flag = true; + } +} + +NoTF32Guard::~NoTF32Guard() { + if (changed) { + override_allow_tf32_flag = false; + } +} + +bool NoTF32Guard::should_disable_tf32() { + return override_allow_tf32_flag; +} + } // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index b8782209def5..fed5e88e5314 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -327,4 +327,20 @@ static inline void manual_seed(uint64_t seed) { } } +// When the global flag `allow_tf32` is set to true, cuBLAS handles are +// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH. +// For some operators, such as addmv, TF32 offers no performance improvement +// but causes precision loss. To help this case, this class implements +// a RAII guard that can be used to quickly disable TF32 within its scope. 
+// +// Usage: +// NoTF32Guard disable_tf32; +struct TORCH_API NoTF32Guard { + NoTF32Guard(); + ~NoTF32Guard(); + static bool should_disable_tf32(); +private: + bool changed = false; +}; + } // namespace at diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0311399649e7..d4b7155b0591 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -407,19 +407,22 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #endif #if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 210) - template <> - void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - cublasOperation_t op = _cublasOpFromChar(trans); - _cublasAdjustLdLevel2(m, n, &lda); - GEMV_CHECK_ARGVALUES(c10::complex); - TORCH_CUDABLAS_CHECK( - cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), - lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), - reinterpret_cast(y), incy)); - } +template <> +void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. + NoTF32Guard disable_tf32; + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + cublasOperation_t op = _cublasOpFromChar(trans); + _cublasAdjustLdLevel2(m, n, &lda); + GEMV_CHECK_ARGVALUES(c10::complex); + TORCH_CUDABLAS_CHECK( + cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), + lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), + reinterpret_cast(y), incy)); +} #endif template <> @@ -436,6 +439,9 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. + NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 404f322545f8..0165c53ac60d 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -45,7 +45,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
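// -------------------------------------------------------------------------
// Editor's note (illustrative only, not part of this patch): how the pieces
// above fit together. An operator that should not run in TF32 constructs the
// RAII guard; while the guard is alive the thread-local override is set, and
// the handle-selection check changed below keeps cuBLAS in
// CUBLAS_DEFAULT_MATH. The operator name here is hypothetical.
//
//     Tensor my_precision_sensitive_op(const Tensor& a) {
//       at::NoTF32Guard disable_tf32;   // TF32 stays off until scope exit
//       return at::matmul(a, a);        // dispatches through this handle
//     }
//
// Nesting is safe: only the outermost guard records `changed`, so the flag
// is restored exactly once, when that guard is destroyed.
// -------------------------------------------------------------------------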
- if (at::globalContext().allowTF32CuBLAS()) { + if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index e93eb11f642c..a8bb81b3e222 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1223,6 +1223,8 @@ Tensor matrix_exp(const Tensor& a) { "matrix_exp(", a.scalar_type(), "{", a.sizes(), "}): expected a tensor " "of squared matrices"); + NoTF32Guard disable_tf32; + if (a.size(-1) == 1) { return a.exp(); } @@ -1231,6 +1233,7 @@ Tensor matrix_exp(const Tensor& a) { } Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { + NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, [](const Tensor& a) { diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 518af2f95a4c..24db4cfe857e 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -18,6 +18,7 @@ IS_SANDCASTLE, IS_WINDOWS from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, RUN_CUDA_MULTI_GPU +from torch.testing._internal.common_cuda import with_tf32_off from typing import List, Tuple from torch import Tensor @@ -900,6 +901,9 @@ def foo(a): self.assertEqual(foo(x), x + x + x) @unittest.skipIf(not RUN_CUDA, "calls .cuda()") + # By default, on Ampere or later GPUs, nn.Linear computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_traced_module_cuda(self): class Model(nn.Module): def __init__(self, num_features, num_layers): diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py index a75da03a6d21..b4efbf12c358 100644 --- a/test/test_jit_fuser.py +++ b/test/test_jit_fuser.py @@ -10,6 +10,7 @@ RUN_CUDA, RUN_CUDA_HALF, RUN_CUDA_MULTI_GPU, warmup_backward from textwrap import dedent from itertools import product, permutations +from torch.testing._internal.common_cuda import with_tf32_off from test_jit import backward_graph, all_backward_graphs, get_lstm_inputs, get_milstm_inputs, \ LSTMCellC, LSTMCellF, LSTMCellS, MiLSTMCell @@ -710,6 +711,9 @@ def test_lstm_cuda(self): "aten::_grad_sum_to_size")) @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_concat_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellC, inputs) @@ -740,6 +744,9 @@ def cell(x, hx, cx, w_ih, w_hh, b_ih, b_hh): # TODO: Fuser doesn't work at all when inputs require grad. Fix that @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. 
+ # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_traced_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellF, inputs) diff --git a/test/test_nn.py b/test/test_nn.py index 9618b70ab71c..8b9bf9156106 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -12018,6 +12018,7 @@ def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device): @onlyCUDA @skipCUDAIfRocm @skipCUDAIfCudnnVersionLessThan(7603) + @tf32_on_and_off(0.05) def test_conv_cudnn_mismatch_memory_format(self, device): configs = [ [4, 2, 8, 8, 4, 2], diff --git a/test/test_torch.py b/test/test_torch.py index ee27c8dd65cf..8c355eb93570 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -41,11 +41,10 @@ from typing import Dict, List, Tuple, Union import torch.backends.quantized import torch.testing._internal.data -from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, \ +from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, with_tf32_off, \ _get_torch_cuda_version, TEST_MAGMA - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings load_tests = load_tests @@ -7003,6 +7002,9 @@ def test_matrix_exp_boundary_cases(self, device, dtype): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_analytic(self, device, dtype): # check zero matrix x = torch.zeros(20, 20, dtype=dtype, device=device) @@ -7144,6 +7146,9 @@ def run_test(*n): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_compare_with_taylor(self, device, dtype): def normalize_to_1_operator_norm(sample, desired_norm): @@ -16471,6 +16476,7 @@ def _test(row_major, incx, incy, lda_tail): @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) @dtypes(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes()) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @tf32_on_and_off(0.05) def test_addmm(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) @@ -19832,13 +19838,13 @@ def inner(self, device, dtype): 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes()), ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, - _cpu_types, True, [tf32_on_and_off(0.005)]), + _cpu_types, True, [tf32_on_and_off(0.01)]), ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? 
is deprecated")]), ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)), ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], @@ -19865,26 +19871,26 @@ def inner(self, device, dtype): [_wrap_maybe_warns("This overload of addcmul_? is deprecated")]), ('addmm', '', _medium_2d, lambda t, d: [_medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), - _cpu_types, True, [tf32_on_and_off(0.005)], 0, True), + _cpu_types, True, [tf32_on_and_off(0.01)], 0, True), ('addmm', 'scalar', _medium_2d, lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), ('addmm', 'two_scalars', _medium_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), ('addmv', '', _medium_1d, lambda t, d: [_medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, - True, [tf32_on_and_off(0.005)], 0, True), + True, [], 0, True), ('addmv', 'scalar', _medium_1d, lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), + [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('addmv', 'two_scalars', _medium_1d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), + [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('addr', '', _medium_2d, lambda t, d: [_medium_1d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, _float_types2), ('addr', 'scalar', _medium_2d, diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py index 88f6bf11976c..f0e8c40602c0 100644 --- a/torch/testing/_internal/common_cuda.py +++ b/torch/testing/_internal/common_cuda.py @@ -127,6 +127,21 @@ def wrapped(self, device, dtype): return wrapped return wrapper + +# This is a wrapper that wraps a test to run it with TF32 turned off. 
+# This wrapper is designed to be used when a test uses matmul or convolutions +# but the purpose of that test is not testing matmul or convolutions. +# Disabling TF32 will enforce torch.float tensors to be always computed +# at full precision. +def with_tf32_off(f): + @functools.wraps(f) + def wrapped(*args, **kwargs): + with tf32_off(): + return f(*args, **kwargs) + + return wrapped + + def _get_torch_cuda_version(): if torch.version.cuda is None: return [0, 0] diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 4b18ec86fa2f..2de86795cda7 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -1601,6 +1601,7 @@ def fractional_max_pool3d_test(test_case): input_size=(2, 4, 10), cudnn=True, with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1620,6 +1621,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad1', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1629,6 +1631,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad2', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1638,6 +1641,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad1size1', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1647,6 +1651,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='pad2size1', with_tf32=True, + tf32_precision=0.005, ), dict( module_name='Conv1d', @@ -1657,6 +1662,7 @@ def fractional_max_pool3d_test(test_case): desc='zero_batch', test_cuda=(not TEST_WITH_ROCM), with_tf32=True, + tf32_precision=0.005, ), dict( fullname='Conv1d_dilated', @@ -1664,6 +1670,7 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::Conv1dOptions(4, 5, 3).dilation(2)', input_size=(2, 4, 10), with_tf32=True, + tf32_precision=0.005, ), dict( fullname='Conv1d_groups', @@ -1672,6 +1679,7 @@ def fractional_max_pool3d_test(test_case): input_size=(2, 4, 6), cudnn=True, with_tf32=True, + tf32_precision=0.005, ), dict( fullname='ConvTranspose1d', @@ -1702,6 +1710,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, desc='dilated', with_tf32=True, + tf32_precision=0.005, ), dict( fullname='ConvTranspose1d_groups', @@ -2117,7 +2126,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, check_with_long_tensor=True, with_tf32=True, - tf32_precision=0.005, + tf32_precision=0.05, ), dict( module_name='Conv3d', @@ -2140,7 +2149,7 @@ def fractional_max_pool3d_test(test_case): desc='stride', check_with_long_tensor=True, with_tf32=True, - tf32_precision=0.005, + tf32_precision=0.05, ), dict( module_name='Conv3d', @@ -2151,7 +2160,7 @@ def fractional_max_pool3d_test(test_case): desc='stride_padding', check_with_long_tensor=True, with_tf32=True, - tf32_precision=0.01, + tf32_precision=0.05, ), dict( module_name='Conv3d', @@ -2180,6 +2189,7 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2)', input_size=(2, 3, 5, 5, 5), with_tf32=True, + tf32_precision=0.05, ), dict( fullname='Conv3d_dilated_strided', @@ -2187,6 +2197,7 @@ def fractional_max_pool3d_test(test_case): cpp_constructor_args='torch::nn::Conv3dOptions(3, 4, 2).dilation(2).stride(2)', input_size=(2, 3, 5, 5, 5), with_tf32=True, + tf32_precision=0.05 ), dict( module_name='ConvTranspose3d', @@ -2195,6 +2206,7 @@ def fractional_max_pool3d_test(test_case): cudnn=True, input_size=(1, 2, 4, 5, 4), with_tf32=True, + tf32_precision=0.05 ), dict( 
module_name='ConvTranspose3d', @@ -2205,6 +2217,7 @@ def fractional_max_pool3d_test(test_case): input_size=(1, 2, 4, 5, 4), desc='dilated', with_tf32=True, + tf32_precision=0.05 ), dict( module_name='MaxPool3d', @@ -5005,6 +5018,8 @@ def __init__(self, *args, **kwargs): self.check_bfloat16 = kwargs.get('check_bfloat16', False) self.convert_target = kwargs.get('convert_target', True) self.test_cpu = kwargs.get('test_cpu', True) + self.with_tf32 = kwargs.get('with_tf32', True) + self.tf32_precision = kwargs.get('tf32_precision', 0.001) def __call__(self, test_case): module = self.constructor(*self.constructor_args) From c79d493096abdb1c87586e41e50c1aa725f28df1 Mon Sep 17 00:00:00 2001 From: Kyle Chen Date: Thu, 24 Sep 2020 11:15:46 -0700 Subject: [PATCH 097/449] added rocm 3.8 docker image (#45205) Summary: jeffdaily Pull Request resolved: https://github.com/pytorch/pytorch/pull/45205 Reviewed By: malfet Differential Revision: D23906606 Pulled By: walterddr fbshipit-source-id: 604a12bf4c97260215a1881cc96e35e7c42b4578 --- .circleci/cimodel/data/simple/docker_definitions.py | 1 + .circleci/config.yml | 3 +++ .circleci/docker/build.sh | 7 +++++++ .circleci/docker/common/install_base.sh | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 59944d190383..90d776311601 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -28,6 +28,7 @@ "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", "pytorch-linux-bionic-rocm3.7-py3.6", + "pytorch-linux-bionic-rocm3.8-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index c952ee716b3d..b70a090bed72 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6320,6 +6320,9 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.7-py3.6" image_name: "pytorch-linux-bionic-rocm3.7-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" + image_name: "pytorch-linux-bionic-rocm3.8-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 9bfa0b195499..0afc1b33c59e 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -262,6 +262,13 @@ case "$image" in VISION=yes ROCM_VERSION=3.7 ;; + pytorch-linux-bionic-rocm3.8-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.8 + ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index ac4e1f18f1ef..5e8173a43627 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -118,7 +118,7 @@ esac # Install Valgrind separately since the apt-get version is too old. mkdir valgrind_build && cd valgrind_build -VALGRIND_VERSION=3.15.0 +VALGRIND_VERSION=3.16.1 if ! 
wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 then wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 From 26001a2334783e083e252e366ec0804c1e12d5e9 Mon Sep 17 00:00:00 2001 From: Xinyu Li Date: Thu, 24 Sep 2020 11:53:58 -0700 Subject: [PATCH 098/449] Revert D23753711: [pytorch][PR] Add foreach APIs for binary ops with ScalarList Test Plan: revert-hammer Differential Revision: D23753711 (https://github.com/pytorch/pytorch/commit/71d1b5b0e227e407e60c0a3dd6a4caabdcd6c89a) Original commit changeset: bf3e8c54bc07 fbshipit-source-id: 192692e0d3fff4cade9983db0a1760fedfc9674c --- aten/src/ATen/native/ForeachOpsKernels.cpp | 24 - aten/src/ATen/native/ForeachUtils.h | 14 - .../native/cuda/ForeachBinaryOpScalarList.cu | 60 -- aten/src/ATen/native/cuda/ForeachFunctors.cuh | 115 ---- .../src/ATen/native/cuda/MultiTensorApply.cuh | 70 --- aten/src/ATen/native/native_functions.yaml | 97 +--- .../check_backward_compatibility.py | 4 - test/test_foreach.py | 529 ++++-------------- test/test_native_functions.py | 2 +- tools/autograd/gen_python_functions.py | 1 - .../templates/python_torch_functions.cpp | 1 - tools/codegen/model.py | 4 - tools/pyi/gen_pyi.py | 1 - torch/csrc/utils/python_arg_parser.cpp | 22 +- torch/csrc/utils/python_arg_parser.h | 18 +- 15 files changed, 119 insertions(+), 843 deletions(-) delete mode 100644 aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 73eb2070c07d..912b5116c4cc 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -24,26 +24,6 @@ std::vector foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor return result; \ } -#define FOREACH_BINARY_OP_SCALARLIST(NAME) \ -void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors, scalars); \ - \ - for (int i = 0; i < tensors.size(); i++) { \ - tensors[i].NAME##_(scalars[i]); \ - } \ -} \ - \ -std::vector foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors, scalars); \ - std::vector result; \ - result.reserve(tensors.size()); \ - for (int i = 0; i < tensors.size(); i++) { \ - result.emplace_back(tensors[i].NAME(scalars[i])); \ - } \ - \ - return result; \ -} - #define FOREACH_BINARY_OP_LIST(NAME) \ std::vector foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \ check_foreach_api_restrictions(tensors1, tensors2); \ @@ -137,10 +117,6 @@ FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); -FOREACH_BINARY_OP_SCALARLIST(add); -FOREACH_BINARY_OP_SCALARLIST(sub); -FOREACH_BINARY_OP_SCALARLIST(mul); -FOREACH_BINARY_OP_SCALARLIST(div); FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 44e6a50297db..5a7aced74702 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -31,12 +31,6 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) { } } -void check_foreach_api_restrictions(TensorList tensors, ArrayRef scalars) { - TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); - TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one 
value."); - TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); -} - // To go via 'fast' path, several conditions must be satisfied // - All tensors must be on the same device // - All tensors must have strided layout @@ -138,13 +132,5 @@ bool can_use_fast_route(TensorList tensors) { return true; } -bool can_use_fast_route(TensorList tensors, ArrayRef scalars) { - TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); - TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); - TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); - - return can_use_fast_route(tensors); -} - } }} // at::native diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu deleted file mode 100644 index 684f12732ffc..000000000000 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include - -namespace at { namespace native { - -template class Op> -std::vector foreach_binary_op(TensorList tensors, at::ArrayRef scalars) { - std::vector> tensor_lists; - std::vector vec_res; - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(vec_res); - - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { - multi_tensor_apply<2>(tensor_lists, scalars, BinaryOpScalarListFunctor()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, scalars, BinaryOpScalarListFunctor_()); - }); -} - -#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \ -void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors, scalars)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \ - } \ - \ - foreach_binary_op_(tensors, scalars); \ -} \ - \ -std::vector foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef scalars) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors, scalars)) { \ - return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ - } \ - \ - return foreach_binary_op(tensors, scalars); \ -} - -FOREACH_BINARY_OP_SCALARLIST(add, std::plus); -FOREACH_BINARY_OP_SCALARLIST(sub, std::minus); -FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies); -FOREACH_BINARY_OP_SCALARLIST(div, std::divides); - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index e83eca3dd8e1..a04d27110c9a 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh +++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -118,121 +118,6 @@ struct BinaryOpScalarFunctor { } }; -template class Op> -struct BinaryOpScalarListFunctor_ { - __device__ void operator() ( - int chunk_size, - TensorListScalarListMetadata<1>& tl) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = 
tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* x = (T*)tl.addresses[0][tensor_loc]; - x += chunk_idx * chunk_size; - - double y = tl.scalar_vals[tensor_loc]; - - n -= chunk_idx * chunk_size; - - T r_x[kILP]; - - // to make things simple, we put aligned case in a different code path - if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) { - for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { - // load - load_store(r_x, x, 0 , i_start); -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } - // store - load_store(x, r_x, i_start, 0); - } - } - else { - for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = 0; - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) { - r_x[ii] = x[i]; - } - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) - x[i] = r_x[ii]; - } - } - } - } -}; - -template class Op> -struct BinaryOpScalarListFunctor { - __device__ void operator() ( - int chunk_size, - TensorListScalarListMetadata<2>& tl) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; - - T* x = (T*)tl.addresses[0][tensor_loc]; - x += chunk_idx * chunk_size; - - T* out = (T*)tl.addresses[1][tensor_loc]; - out += chunk_idx * chunk_size; - - double y = tl.scalar_vals[tensor_loc]; - - n -= chunk_idx * chunk_size; - - T r_x[kILP]; - - // to make things simple, we put aligned case in a different code path - if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) { - for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { - // load - load_store(r_x, x, 0 , i_start); -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } - // store - load_store(out, r_x, i_start, 0); - } - } - else { - for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = 0; - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) { - r_x[ii] = x[i]; - } - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), y); - } -#pragma unroll - for(int ii = 0; ii < kILP; ii++) { - int i = i_start + threadIdx.x + ii * blockDim.x; - if(i < n && i < chunk_size) - out[i] = r_x[ii]; - } - } - } - } -}; - template class Op> struct BinaryOpListAlphaFunctor_ { __device__ void operator() ( diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index d162af19fd1b..f82a0d9a58c8 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -26,7 +26,6 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s // TensorListMetadata has to be < 4KB - the limit for kernel launch argument static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; -static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; template struct TensorListMetadata { @@ 
-36,15 +35,6 @@ template struct TensorListMetadata int block_to_chunk[depth_to_max_blocks[n-1]]; }; -template struct TensorListScalarListMetadata -{ - void* addresses[n][depth_to_max_tensors_scalarlist[n-1]]; - int sizes[depth_to_max_tensors_scalarlist[n-1]]; - double scalar_vals[depth_to_max_tensors_scalarlist[n-1]]; - unsigned char block_to_tensor[depth_to_max_blocks[n-1]]; - int block_to_chunk[depth_to_max_blocks[n-1]]; -}; - template C10_LAUNCH_BOUNDS_1(kBlockSize) __global__ void @@ -59,71 +49,11 @@ multi_tensor_apply_kernel( template void multi_tensor_apply( std::vector>& tensor_lists, - at::ArrayRef scalars, T callable, ArgTypes... args) { TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); - size_t n_tensors = tensor_lists[0].size(); - TensorListScalarListMetadata tensorListMeta; - - int loc_block_info = 0; - int loc_tensor_info = 0; - for(size_t t = 0; t < n_tensors; t++) { - - tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t]; - - tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); - for (int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); - } - loc_tensor_info++; - - int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize; - for (int chunk = 0; chunk < chunks; chunk++) { - tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; - tensorListMeta.block_to_chunk[loc_block_info] = chunk; - loc_block_info++; - - bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] && - chunk == chunks - 1); - bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]); - bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1); - - if (tensors_full || blocks_full || last_chunk) { - multi_tensor_apply_kernel<<>>( - tensorListMeta, - callable, - args...); - - AT_CUDA_CHECK(cudaGetLastError()); - - // Reset. - loc_block_info = 0; - if(chunk == chunks - 1) { - loc_tensor_info = 0; - } - else { - tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; - tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1]; - for(int d = 0; d < depth; d++) { - tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1]; - } - loc_tensor_info = 1; - } - } - } - } - } - -template -void multi_tensor_apply( - std::vector>& tensor_lists, - T callable, - ArgTypes... 
args) { - TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); - const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); size_t n_tensors = tensor_lists[0].size(); TensorListMetadata tensorListMeta; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8068bc1721df..f5bbb263ed9c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6187,7 +6187,6 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6195,7 +6194,6 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6203,7 +6201,6 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6211,7 +6208,6 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6219,7 +6215,6 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6227,7 +6222,6 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6235,39 +6229,34 @@ CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ -- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ -- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] device_guard: False variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () device_guard: False variants: function dispatch: @@ -6275,7 +6264,6 @@ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full device_guard: False 
variants: function dispatch: @@ -6283,15 +6271,13 @@ CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ -- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full +- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[] device_guard: False variants: function dispatch: @@ -6299,79 +6285,13 @@ CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ -- func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow - CUDA: foreach_tensor_add_scalarlist_kernel_cuda - -- func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_add_scalarlist_kernel_slow_ - CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - -- func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow - CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - -- func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sub_scalarlist_kernel_slow_ - CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - -- func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow - CUDA: foreach_tensor_div_scalarlist_kernel_cuda - -- func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_div_scalarlist_kernel_slow_ - CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - -- func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow - CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - -- func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_mul_scalarlist_kernel_slow_ - CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - - func: _foreach_exp(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6379,7 +6299,6 @@ CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6387,7 +6306,6 @@ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6395,7 +6313,6 @@ CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full 
device_guard: False variants: function dispatch: @@ -6403,7 +6320,6 @@ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6411,7 +6327,6 @@ CUDA: foreach_tensor_addcdiv_cuda_ - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6419,7 +6334,6 @@ CUDA: foreach_tensor_addcmul_cuda_ - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6427,7 +6341,6 @@ CUDA: foreach_tensor_addcdiv_cuda - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full device_guard: False variants: function dispatch: diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 4303fc563cfc..739a4de51951 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -99,10 +99,6 @@ ("preprocess", datetime.date(2020, 10, 1)), ("compile", datetime.date(2020, 10, 1)), ("execute", datetime.date(2020, 10, 1)), - ("aten::_foreach_add", datetime.date(2020, 10, 1)), - ("aten::_foreach_sub_", datetime.date(2020, 10, 1)), - ("aten::_foreach_div", datetime.date(2020, 10, 1)), - ("aten::_foreach_sub", datetime.date(2020, 10, 1)), ] diff --git a/test/test_foreach.py b/test/test_foreach.py index 85d79096b2ad..8369ba5b9be5 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -4,30 +4,21 @@ from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, skipCUDAIfRocm class TestForeach(TestCase): - foreach_bin_ops = [ + bin_ops = [ torch._foreach_add, - torch._foreach_sub, - torch._foreach_mul, - torch._foreach_div, - ] - - foreach_bin_ops_ = [ torch._foreach_add_, + torch._foreach_sub, torch._foreach_sub_, + torch._foreach_mul, torch._foreach_mul_, + torch._foreach_div, torch._foreach_div_, ] - torch_bin_ops = [ - torch.add, - torch.sub, - torch.mul, - torch.div, - ] - def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] + elif dtype in torch.testing.get_all_int_dtypes(): tensors = [torch.randint(1, 100, (N, N), device=device, dtype=dtype) for _ in range(N)] else: @@ -35,39 +26,36 @@ def _get_test_data(self, device, dtype, N): return tensors - def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): - for N in [30, 300]: - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] - res = foreach_op(tensors1, tensors2) - foreach_op_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, res) - - def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): - for N in [30, 300]: - tensors1 = self._get_test_data(device, dtype, N) - expected = [torch_op(tensors1[i]) for i in range(N)] - res = foreach_op(tensors1) - foreach_op_(tensors1) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_pointwise_op(self, device, dtype, 
foreach_op, foreach_op_, torch_op): - for N in [30, 300]: - tensors = self._get_test_data(device, dtype, N) - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - value = 2 - - expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] - - res = foreach_op(tensors, tensors1, tensors2, value) - foreach_op_(tensors, tensors1, tensors2, value) - self.assertEqual(res, tensors) - self.assertEqual(tensors, expected) + def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] + res = foreach_op(tensors1, tensors2) + foreach_op_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, expected) + + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): + tensors1 = self._get_test_data(device, dtype, N) + expected = [torch_op(tensors1[i]) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, expected) + + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): + tensors = self._get_test_data(device, dtype, N) + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + value = 2 + + expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] + + res = foreach_op(tensors, tensors1, tensors2, value) + foreach_op_(tensors, tensors1, tensors2, value) + self.assertEqual(res, tensors) + self.assertEqual(tensors, expected) def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): tensors1 = self._get_test_data(device, dtype, N) @@ -75,8 +63,8 @@ def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_ alpha = 2 expected = [torch_op(tensors1[i], torch.mul(tensors2[i], alpha)) for i in range(N)] - res = foreach_op(tensors1, tensors2, alpha=alpha) - foreach_op_(tensors1, tensors2, alpha=alpha) + res = foreach_op(tensors1, tensors2, alpha) + foreach_op_(tensors1, tensors2, alpha) self.assertEqual(res, tensors1) if dtype == torch.bool: @@ -100,7 +88,7 @@ def test_exp(self, device, dtype): @skipCUDAIfRocm @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_addcmul(self, device, dtype): - if self.device_type == 'cpu': + if device == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcmul_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcmul, @@ -117,7 +105,7 @@ def test_addcdiv(self, device, dtype): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, torch._foreach_addcdiv_, torch.addcdiv) return - if self.device_type == 'cpu': + if device == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcdiv_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, @@ -130,372 +118,83 @@ def test_addcdiv(self, device, dtype): # @dtypes(*torch.testing.get_all_dtypes()) def test_int_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = 3 - expected = 
[torch_bin_op(t, scalar) for t in tensors] - - res = foreach_bin_op(tensors, scalar) - - if dtype == torch.bool: - self.assertEqual(res, expected) - - with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - return - - - if foreach_bin_op_ == torch._foreach_div_ and dtype in torch.testing.integral_types() and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, - "can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - return - - # TODO[type promotion]: Fix once type promotion is enabled. - if dtype in torch.testing.integral_types() and self.device_type == 'cuda': - self.assertEqual(res, [e.to(dtype) for e in expected]) - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, expected) - - # TODO[Fix scalar list]: - # We need to update codegen to correctly handle function overloads with float[] and int[]. - # As optimizers work with float tensors, the result will always be torch.float32 for now. - # Current schema is using 'float[]' as scalar list type. - @dtypes(*torch.testing.get_all_dtypes()) - def test_int_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [1 for _ in range(N)] - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - - # we dont support bool and complex types on CUDA for now - if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - return - - res = foreach_bin_op(tensors, scalars) - - if dtype == torch.bool: - self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) - - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - - if dtype in torch.testing.integral_types(): - if self.device_type == 'cpu': - self.assertEqual(res, [e.to(torch.float32) for e in expected]) - else: - # TODO[type promotion]: Fix once type promotion is enabled. 
- self.assertEqual(res, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types() and self.device_type == 'cpu': - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - else: - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + int_scalar = 1 + + # bool tensor + 1 will result in int64 tensor + if dtype == torch.bool: + expected = [torch.ones(10, 10, device=device, dtype=torch.int64) for _ in range(10)] + else: + expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] + + res = torch._foreach_add(tensors, int_scalar) + self.assertEqual(res, expected) + + if dtype in [torch.bool]: + with self.assertRaisesRegex(RuntimeError, + "result type Long can't be cast to the desired output type Bool"): + torch._foreach_add_(tensors, int_scalar) + else: + torch._foreach_add_(tensors, int_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_float_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = 3.3 - expected = [torch_bin_op(t, scalar) for t in tensors] - - if dtype == torch.bool: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op(tensors, scalar) - return - - res = foreach_bin_op(tensors, scalar) - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types(): - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - return - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, expected) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + float_scalar = 1. 
- @dtypes(*torch.testing.get_all_dtypes()) - def test_float_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [1.1 for _ in range(N)] - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - - # we dont support bool and complex types on CUDA for now - if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - return - - res = foreach_bin_op(tensors, scalars) - - if dtype == torch.bool: - # see TODO[Fix scalar list] - self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) - - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - - if dtype in torch.testing.integral_types() and self.device_type == 'cuda': - # see TODO[Fix scalar list] - self.assertEqual(res, [e.to(dtype) for e in expected]) - - foreach_bin_op_(tensors, scalars) - self.assertEqual(tensors, res) - return - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types() and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalars) - return - - foreach_bin_op_(tensors, scalars) - self.assertEqual(tensors, expected) + # float scalar + integral tensor will result in float tensor + if dtype in [torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64, torch.bool]: + expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)] + else: + expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] + + res = torch._foreach_add(tensors, float_scalar) + self.assertEqual(res, expected) + + if dtype in [torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64, torch.bool]: + self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar)) + else: + torch._foreach_add_(tensors, float_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_complex_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = 3 + 5j - expected = [torch_bin_op(t, scalar) for t in tensors] - - if dtype == torch.bool: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op(tensors, scalar) - return - - if dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=True) and \ - self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): - foreach_bin_op(tensors, scalar) - return - - res = foreach_bin_op(tensors, scalar) - 
self.assertEqual(res, expected) - - if dtype not in [torch.complex64, torch.complex128]: - with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): - foreach_bin_op_(tensors, scalar) - else: - foreach_bin_op_(tensors, scalar) - self.assertEqual(res, tensors) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + complex_scalar = 3 + 5j - @dtypes(*torch.testing.get_all_dtypes()) - def test_complex_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [3 + 5j for _ in range(N)] - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - - if dtype == torch.bool: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op_(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): - foreach_bin_op(tensors, scalar) - return - - with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): - res = foreach_bin_op(tensors, scalars) - - with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): - foreach_bin_op_(tensors, scalars) + # bool tensor + 1 will result in int64 tensor + expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype)) for _ in range(10)] + + if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0': + # value cannot be converted to dtype without overflow: + self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) + self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar)) + return + + res = torch._foreach_add(tensors, complex_scalar) + self.assertEqual(res, expected) + + if dtype not in [torch.complex64, torch.complex128]: + self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) + else: + torch._foreach_add_(tensors, complex_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_bool_scalar(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalar = True - - if dtype == torch.bool: - expected = [torch_bin_op(t, scalar) for t in tensors] - res = foreach_bin_op(tensors, scalar) - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, res) - return - - if foreach_bin_op == torch._foreach_sub and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): - res = foreach_bin_op(tensors, scalar) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): - foreach_bin_op_(tensors, scalar) - elif foreach_bin_op == torch._foreach_sub and self.device_type == 'cuda': - res = foreach_bin_op(tensors, scalar) - self.assertEqual(res, foreach_bin_op(tensors, 1)) - - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, res) - else: - expected = [torch_bin_op(t, scalar) for t in tensors] - res = foreach_bin_op(tensors, scalar) - - # TODO[type promotion]: Fix once type promotion is enabled. 
- if dtype in torch.testing.integral_types() and self.device_type == 'cuda': - self.assertEqual(res, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types(): - if foreach_bin_op == torch._foreach_div and self.device_type == "cpu": - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): - foreach_bin_op_(tensors, scalar) - else: - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, res) - else: - foreach_bin_op_(tensors, scalar) - self.assertEqual(tensors, expected) + tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] + bool_scalar = True - @dtypes(*torch.testing.get_all_dtypes()) - def test_bool_scalarlist(self, device, dtype): - for N in [30, 300]: - for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, - self.foreach_bin_ops_, - self.torch_bin_ops): - tensors = self._get_test_data(device, dtype, N) - scalars = [True for _ in range(N)] - - if dtype == torch.bool: - if self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - return - else: - if foreach_bin_op == torch._foreach_sub: - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): - foreach_bin_op(tensors, scalars) - else: - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired"): - foreach_bin_op_(tensors, scalars) - - res = foreach_bin_op(tensors, scalars) - for r in res: - self.assertTrue(r.dtype == torch.float32) - else: - # we dont support bool and complex types on CUDA for now - if (dtype in torch.testing.get_all_complex_dtypes()) and self.device_type == 'cuda': - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op_(tensors, scalars) - - with self.assertRaisesRegex(RuntimeError, "not implemented for"): - foreach_bin_op(tensors, scalars) - return - - if foreach_bin_op == torch._foreach_sub: - if self.device_type == "cpu": - # see TODO[Fix scalar list] - res = foreach_bin_op(tensors, scalars) - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [r.to(torch.float32) for r in foreach_bin_op(tensors, 1)]) - - with self.assertRaisesRegex(RuntimeError, "esult type Float can't be cast to the "): - foreach_bin_op_(tensors, scalars) - else: - self.assertEqual(res, foreach_bin_op(tensors, 1)) - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) - else: - # see TODO[Fix scalar list] - res = foreach_bin_op(tensors, scalars) - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [r.to(dtype) for r in foreach_bin_op(tensors, 1)]) - else: - self.assertEqual(res, foreach_bin_op(tensors, 1)) - - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) - else: - if self.device_type == "cpu": - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - res = foreach_bin_op(tensors, scalars) - - # see TODO[Fix scalar list] - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [e.to(torch.float32) for e in expected]) - else: - self.assertEqual(res, expected) - - if dtype in torch.testing.integral_types(): - with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to 
the desired "): - foreach_bin_op_(tensors, scalars) - else: - foreach_bin_op_(tensors, scalars) - self.assertEqual(tensors, expected) - else: - expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] - res = foreach_bin_op(tensors, scalars) - - if dtype in torch.testing.integral_types(): - self.assertEqual(res, [e.to(dtype) for e in expected]) - else: - self.assertEqual(res, expected) - - foreach_bin_op_(tensors, scalars) - self.assertEqual(res, tensors) + expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] + + res = torch._foreach_add(tensors, bool_scalar) + self.assertEqual(res, expected) + + torch._foreach_add_(tensors, bool_scalar) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_add_with_different_size_tensors(self, device, dtype): @@ -549,9 +248,9 @@ def test_add_list_error_cases(self, device): # One empty list tensors1.append(torch.tensor([1], device=device)) - with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): + with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): torch._foreach_add(tensors1, tensors2) - with self.assertRaisesRegex(RuntimeError, "Scalars list must have at least one value."): + with self.assertRaisesRegex(RuntimeError, "Tensor list must have at least one tensor."): torch._foreach_add_(tensors1, tensors2) # Lists have different amount of tensors @@ -619,25 +318,13 @@ def test_div_list(self, device, dtype): self.skipTest("Skipped! See https://github.com/pytorch/pytorch/issues/44489") return - for N in [30, 300]: - tensors1 = self._get_test_data(device, dtype, N) - - if dtype in [torch.bfloat16, torch.bool, torch.float16]: - tensors2 = [torch.zeros(N, N, device=device, dtype=dtype).add(2) for _ in range(N)] - else: - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch.div(tensors1[i], tensors2[i]) for i in range(N)] - res = torch._foreach_div(tensors1, tensors2) - torch._foreach_div_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, res) + self._test_bin_op_list(device, dtype, torch._foreach_div, torch._foreach_div_, torch.div) def test_bin_op_list_error_cases(self, device): tensors1 = [] tensors2 = [] - for bin_op in self.foreach_bin_ops + self.foreach_bin_ops_: + for bin_op in self.bin_ops: # Empty lists with self.assertRaises(RuntimeError): bin_op(tensors1, tensors2) diff --git a/test/test_native_functions.py b/test/test_native_functions.py index e5afc79f037a..869c7aad47fb 100644 --- a/test/test_native_functions.py +++ b/test/test_native_functions.py @@ -58,7 +58,7 @@ def fake_module(values, const): self.do_test_optional_floatlist_with_module(fake_module) def test_optional_floatlist_invalid(self): - with self.assertRaisesRegex(TypeError, "must be tuple of floats, not list"): + with self.assertRaisesRegex(TypeError, "must be .* but found"): FloatListWrapperModule()(torch.zeros(1), ["hi"]) with self.assertRaisesRegex(RuntimeError, "value of type .* instead found type"): diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 8f272de9a5f6..995dff38030b 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -281,7 +281,6 @@ def create_python_bindings(python_functions, is_python_method, module): 'c10::optional': 'toBoolOptional', 'c10::optional': 'toDoubleOptional', 'c10::optional>': 'doublelistOptional', - 'ArrayRef': 'doublelist', 'IntArrayRef': 'intlist', 'Scalar': 'scalar', 'ScalarType': 
'scalartype', diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 673af99bce77..62e9b8dd227f 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -44,7 +44,6 @@ using at::Generator; using at::TensorList; using at::Dimname; using at::DimnameList; -using at::ArrayRef; using namespace torch::autograd::utils; diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 4ec0dc428b81..b0c470c91b6a 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -304,10 +304,6 @@ def __post_init__(self) -> None: # TODO: fixme if str(self.name) not in [ '_amp_non_finite_check_and_unscale_', - '_foreach_add_.ScalarList', - '_foreach_sub_.ScalarList', - '_foreach_mul_.ScalarList', - '_foreach_div_.ScalarList', '_foreach_add_.Scalar', '_foreach_sub_.Scalar', '_foreach_mul_.Scalar', diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index d24966f9fb52..7079c6750223 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -146,7 +146,6 @@ def type_to_python(typename, size=None): 'Dimname': 'Union[str, ellipsis, None]', 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', 'QScheme': '_qscheme', - 'ArrayRef' : 'Sequence[float]' }[typename] return typename diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index f9e26af63ada..e954bef398e9 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -366,23 +366,6 @@ bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector return true; } -bool is_float_list(PyObject* obj) { - auto tuple = six::isTuple(obj); - if (!(tuple || PyList_Check(obj))) { - return false; - } - - auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); - if (size > 0) { - PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0); - if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) { - return false; - } - } - - return true; -} - // argnum is needed for raising the TypeError, it's used in the error message. auto FunctionParameter::check(PyObject* obj, std::vector &overloaded_args, int argnum) -> bool { @@ -437,9 +420,7 @@ auto FunctionParameter::check(PyObject* obj, std::vector &overloaded // if a size is specified (e.g. IntArrayRef[2]) we also allow passing a single int return size > 0 && THPUtils_checkLong(obj); } - case ParameterType::FLOAT_LIST: { - return is_float_list(obj); - } + case ParameterType::FLOAT_LIST: return (PyTuple_Check(obj) || PyList_Check(obj)); case ParameterType::GENERATOR: return THPGenerator_Check(obj); case ParameterType::BOOL: return PyBool_Check(obj); case ParameterType::STORAGE: return isStorage(obj); @@ -920,7 +901,6 @@ PythonArgs PythonArgParser::raw_parse(PyObject* self, PyObject* args, PyObject* print_error(self, args, kwargs, parsed_args); } - void PythonArgParser::print_error(PyObject* self, PyObject* args, PyObject* kwargs, PyObject* parsed_args[]) { // NOLINT auto num_args = PyTuple_GET_SIZE(args) + (kwargs ? 
PyDict_Size(kwargs) : 0); std::vector plausible_idxs; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index d0e2bdc074ff..78efb6cf2db3 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -173,8 +173,6 @@ struct PythonArgs { inline c10::optional toBoolOptional(int i); inline c10::optional toDoubleOptional(int i); inline c10::OptionalArray doublelistOptional(int i); - inline std::vector doublelist(int i); - inline std::vector getDoublelist(int i); inline at::Layout layout(int i); inline at::Layout layoutWithDefault(int i, at::Layout default_layout); inline c10::optional layoutOptional(int i); @@ -371,7 +369,10 @@ inline c10::OptionalArray PythonArgs::intlistOptional(int i) { return intlist(i); } -inline std::vector PythonArgs::getDoublelist(int i) { +inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { + if (!args[i]) { + return {}; + } PyObject* arg = args[i]; auto tuple = PyTuple_Check(arg); auto size = tuple ? PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); @@ -389,17 +390,6 @@ inline std::vector PythonArgs::getDoublelist(int i) { return res; } -inline c10::OptionalArray PythonArgs::doublelistOptional(int i) { - if (!args[i]) { - return {}; - } - return this->getDoublelist(i); -} - -inline std::vector PythonArgs::doublelist(int i) { - return this->getDoublelist(i); -} - inline at::ScalarType PythonArgs::scalartypeWithDefault(int i, at::ScalarType default_scalartype) { if (!args[i]) return default_scalartype; return scalartype(i); From c211a9102f20cf85eba3a395a20567baa73c764f Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 24 Sep 2020 11:54:41 -0700 Subject: [PATCH 099/449] add rocm 3.8 to nightly builds (#45222) Summary: Corresponding change in builder repo: https://github.com/pytorch/builder/pull/528. 
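For reference, a small sketch of how the new version label flows into the build matrix (the CUDA list below is assumed, for illustration only; only the ROCm entries come from this change):

```
# Assumed CUDA versions, for illustration only; the real values live in dimensions.py.
CUDA_VERSIONS = ["10.1", "10.2"]

ROCM_VERSIONS = ["3.7", "3.8"]                        # "3.8" is the new entry
ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]

# Every label becomes a gpu_version choice for the nightly binary builds.
GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS
print(GPU_VERSIONS)  # [None, 'cuda10.1', 'cuda10.2', 'rocm3.7', 'rocm3.8']
```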
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45222 Reviewed By: ezyang Differential Revision: D23894831 Pulled By: walterddr fbshipit-source-id: c6a256ec325ddcf5836b4d293f546368d58db538 --- .circleci/cimodel/data/binary_build_data.py | 6 +- .circleci/cimodel/data/dimensions.py | 5 +- .circleci/config.yml | 156 ++++++++++++++++++++ 3 files changed, 163 insertions(+), 4 deletions(-) diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 58fbbd08f994..21b6eebef5a1 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version): )), # Skip CUDA-9.2 builds on Windows windows=( - [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]], + [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], OrderedDict( wheel=dimensions.STANDARD_PYTHON_VERSIONS, conda=dimensions.STANDARD_PYTHON_VERSIONS, @@ -142,11 +142,11 @@ def get_children(self): # XXX disabling conda rocm build since docker images are not there if self.find_prop("package_format") == 'conda': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) # XXX libtorch rocm build is temporarily disabled if self.find_prop("package_format") == 'libtorch': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) return [ArchConfigNode(self, v) for v in gpu_versions] diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 93d4d645a53a..1f83cd61b13c 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -9,9 +9,12 @@ ROCM_VERSIONS = [ "3.7", + "3.8", ] -GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS] +ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS] + +GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS STANDARD_PYTHON_VERSIONS = [ "3.6", diff --git a/.circleci/config.yml b/.circleci/config.yml index b70a090bed72..700a4155441d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2130,6 +2130,39 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.7" + - binary_linux_build: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -3429,6 +3462,51 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" 
resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test build_environment: "conda 3.6 cpu devtoolset7" @@ -4932,6 +5010,48 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.7 + - binary_upload: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -7458,6 +7578,42 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.8m 
rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly build_environment: "conda 3.6 cpu devtoolset7" From c3a5aed5f7f13f193c7444b1c7af12344e8ce964 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 24 Sep 2020 12:10:10 -0700 Subject: [PATCH 100/449] Run pytorch_core CUDA tests on GPU using TPX Summary: Modify contbuild to disable sanitizers, add option to run "cuda" test using TPX RE (Note: this ignores all push blocking failures!) Test Plan: CI Reviewed By: walterddr, cspanda Differential Revision: D23854578 fbshipit-source-id: 327d7cc3655c17034a6a7bc78f69967403290623 --- test/test_cuda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_cuda.py b/test/test_cuda.py index 2d23954cfcf8..6c904a67e619 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -21,7 +21,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, tri_large_tests_args, \ _compare_trilu_indices, _compare_large_trilu_indices from torch.testing._internal.common_utils import TestCase, get_gpu_type, freeze_rng_state, run_tests, \ - NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, \ + NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_SANDCASTLE, \ slowTest, skipCUDANonDefaultStreamIf, TEST_WITH_ROCM, TEST_NUMPY from torch.testing._internal.autocast_test_lists import AutocastTestLists @@ -1732,6 +1732,7 @@ def test_streaming_backwards_device_transfer(self): self.assertTrue(b.grad.sum().item() == 4 * size) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @unittest.skipIf(not IS_SANDCASTLE, "Does not work on Sandcastle") def test_cuda_init_race(self): # See https://github.com/pytorch/pytorch/issues/16559 import subprocess From e2bcdc7b697de757991664d81735d797e70be59b Mon Sep 17 00:00:00 2001 From: Xiaomeng Yang Date: Thu, 24 Sep 2020 12:28:12 -0700 Subject: [PATCH 101/449] [Caffe2] Fix LayerNormOp when batch_size == 0. (#45250) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45250 [Caffe2] Fix LayerNormOp when batch_size == 0. 
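The expected behavior in this degenerate case is to return empty outputs rather than crash. A minimal sketch of the shapes involved (assuming a 2-D input with axis == 1, mirroring the reference function in the test below):

```
import numpy as np

X = np.random.randn(0, 4).astype(np.float32)         # M == 0 rows, N == 4 features
Y = np.zeros_like(X)                                  # output keeps shape (0, 4)
mean = np.zeros(X.shape[:1] + (1,), dtype=X.dtype)    # shape (0, 1)
sigma = np.zeros(X.shape[:1] + (1,), dtype=X.dtype)   # shape (0, 1)
```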
Test Plan: buck test mode/dev-nosan //caffe2/caffe2/python/operator_test:layer_norm_op_test Reviewed By: houseroad Differential Revision: D23892091 fbshipit-source-id: 9a34654dd6880c9d14b7111fcf850e4f48ffdf91 --- caffe2/operators/layer_norm_op.h | 15 ++++++++++ .../operator_test/layer_norm_op_test.py | 28 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/caffe2/operators/layer_norm_op.h b/caffe2/operators/layer_norm_op.h index e1e8ec0693d6..543ad8dd0b34 100644 --- a/caffe2/operators/layer_norm_op.h +++ b/caffe2/operators/layer_norm_op.h @@ -52,6 +52,11 @@ class LayerNormOp final : public Operator { T* sigma_data = sigma->template mutable_data(); T* scale_data = scale_.template mutable_data(); T* bias_data = bias_.template mutable_data(); + + if (M == 0) { + return true; + } + const std::array X_dims = {M, N}; const std::array Y_dims = {M, 1}; math::Moments( @@ -174,6 +179,16 @@ class LayerNormGradientOp final : public Operator { g_scale_data = g_scale_.template mutable_data(); } + if (M == 0) { + if (N > 0 && dgamma_data != nullptr) { + math::Set(N, T(0), dgamma_data, &context_); + } + if (N > 0 && dbeta_data != nullptr) { + math::Set(N, T(0), dbeta_data, &context_); + } + return true; + } + ComputeInternalGradients( M, N, dY_data, X_data, gamma_data, dX_data, ds_data, db_data); ComputeFusedParams( diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 56cd72d69991..62e94afe9e7d 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -373,6 +373,34 @@ def test_layer_norm_brew_wrapper(self, X, gc, dc): self.ws.create_net(model.param_init_net).run() self.ws.create_net(model.net).run() + @given(N=st.integers(1, 10), elementwise_affine=st.booleans(), **hu.gcs) + @settings(deadline=None) + def test_layer_norm_with_empty_batch(self, N, elementwise_affine, gc, dc): + X = np.random.randn(0, N).astype(np.float32) + gamma = np.random.rand(N).astype(np.float32) + beta = np.random.rand(N).astype(np.float32) + + op = core.CreateOperator( + "LayerNorm", + ["X", "gamma", "beta"] if elementwise_affine else ["X"], + ["Y", "mean", "sigma"], + elementwise_affine=elementwise_affine, + ) + + def ref(X, gamma=None, beta=None): + Y = np.zeros_like(X) + axis = 1 + mean = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + sigma = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + return Y, mean, sigma + + + inputs = [X, gamma, beta] if elementwise_affine else [X] + self.assertReferenceChecks(gc, op, inputs, ref) + self.assertDeviceChecks(dc, op, inputs, [0, 1]) + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + if __name__ == "__main__": unittest.main() From 022ba5a78bc18fd1947cd666d5eab7dbb4eb7328 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 24 Sep 2020 13:27:57 -0700 Subject: [PATCH 102/449] Make ddp_comm_hook_wrapper a private method. (#44643) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44643 This method is not used anywhere else. Also formatted the file. 
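The public entry point is unchanged; a usage sketch based on the docstring below (the DDP model and process group are assumed to be constructed elsewhere):

```
from torch.distributed.algorithms.ddp_comm_hooks import DDPCommHookType

def register_allreduce_hook(ddp_model, process_group):
    # The enum value is a partial over the (now private) wrapper, so callers
    # only pass the DistributedDataParallel model and the hook state.
    DDPCommHookType.ALLREDUCE.value(model=ddp_model, state=process_group)
```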
Test Plan: buck test caffe2/test/distributed/algorithms/ddp_comm_hooks:test_ddp_hooks Reviewed By: pritamdamania87 Differential Revision: D23675945 fbshipit-source-id: 2d04f94589a20913e46b8d71e6a39b70940c1461 --- .../algorithms/ddp_comm_hooks/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index 51678fe44590..6b07e23c9476 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -6,24 +6,27 @@ from torch.nn.parallel import DistributedDataParallel -def ddp_comm_hook_wrapper(comm_hook, model, state): +def _ddp_comm_hook_wrapper(comm_hook, model, state): model._register_comm_hook(state, comm_hook) class DDPCommHookType(Enum): - ''' + """ DDPCommHookType enumerates the hooks of ``torch.distributed.algorithms.ddp_comm_hooks`` as names and ``ddp_comm_hook_wrapper`` partials with hook specified. As an example, you can register allreduce hook by ``DDPCommHookType.ALLREDUCE.value(model=model, state=process_group)``. - ''' - ALLREDUCE = partial(ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook) - FP16_COMPRESS = partial(ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook) + """ + + ALLREDUCE = partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook) + FP16_COMPRESS = partial( + _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook + ) QUANTIZE_PER_TENSOR = partial( - ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook + _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook ) QUANTIZE_PER_CHANNEL = partial( - ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook + _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook ) From cbe1eac1f48329b4706149cd3a05898213542398 Mon Sep 17 00:00:00 2001 From: Danny Huang Date: Thu, 24 Sep 2020 14:20:23 -0700 Subject: [PATCH 103/449] [caffe2] adds Cancel to SafeDequeueBlobsOp and SafeEnqueueBlobsOp (#45177) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45177 ## Motivation * To be able to make C2 ops cancellable so we can safely exit. * Some C2 operators are now blocking thus being non-cancellable. If an error occurs we need to be able to safely stop all net execution so we can throw the exception to the caller. ## Summary * When an error occurs in a net or it got cancelled, running ops will have the `Cancel` method called. This diff adds `Cancel` method to the `SafeEnqueueBlobsOp` and `SafeDequeueBlobsOp` to have the call queue->close() to force all the blocking ops to return. * Adds unit test that verified the error propagation. 
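A condensed sketch of the caller-side pattern this enables (adapted from the hypothesis test referenced in the Test Plan below; the queue sizes and sleep are illustrative):

```
import threading
import time

from caffe2.python import core, workspace

init_net = core.Net("init_net")
queue = init_net.CreateBlobsQueue([], "queue_name", capacity=2, num_blobs=1)
ws = workspace.Workspace()
ws.create_net(init_net).run()

net = core.Net("net")
net.Proto().type = "async_scheduling"
net.SafeDequeueBlobs([queue], 2)      # asks for more blobs than will ever arrive

net_instance = ws.create_net(net)

def cancel_later():
    time.sleep(5)
    net_instance.cancel()             # Cancel() closes the queue, unblocking the op

threading.Thread(target=cancel_later).start()
net_instance.run()                    # raises once the blocked dequeue is cancelled
```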
Test Plan: ## Unit test added to verify that queue ops propagate errors ``` buck test caffe2/caffe2/python:hypothesis_test -- test_safe_dequeue_blob__raises_exception_when_hang --stress-runs 1000 ``` ``` Summary Pass: 1000 ListingSuccess: 1 ``` Reviewed By: d4l3k Differential Revision: D23846967 fbshipit-source-id: c7ddd63259e033ed0bed9df8e1b315f87bf59394 --- caffe2/queue/queue_ops.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h index 64ab19937929..bb70e0f85885 100644 --- a/caffe2/queue/queue_ops.h +++ b/caffe2/queue/queue_ops.h @@ -113,6 +113,12 @@ class SafeEnqueueBlobsOp final : public Operator { 1, !status, Output(size)->template mutable_data(), &context_); return true; } + + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } }; template @@ -192,6 +198,12 @@ class SafeDequeueBlobsOp final : public Operator { return true; } + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } + private: int numRecords_; std::vector blobs_; From 71e6ce66166bb74dbec0fffcdfc72b5fb0e6f9d5 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Thu, 24 Sep 2020 14:29:55 -0700 Subject: [PATCH 104/449] [JIT] Specialize AutogradZero: merge AutogradAnyNonZero and Not(AutogradAnyNonZero) checks into one. (#44987) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44987 This PR introduces new `prim::AutogradAllZero` and `prim::AutogradAllNonZero` ops that are used for a batch check for multiple tensors. The specialize-autogradzero pass now generates one check for all expected-to-be-undefined tensors, one check for all expected-to-be-defined tensors, and a bunch of checks for size parameters passed to `grad_sum_to_size` (this probably could be cleaned up somehow as well in future). 
An example of what we generated before this change: ``` %1626 : bool = prim::AutogradAnyNonZero(%0) %1627 : bool = prim::AutogradAnyNonZero(%2) %1628 : bool = aten::__not__(%1627) %1629 : bool = prim::AutogradAnyNonZero(%3) %1630 : bool = aten::__not__(%1629) %1631 : bool = prim::AutogradAnyNonZero(%4) %1632 : bool = aten::__not__(%1631) %1633 : bool = prim::AutogradAnyNonZero(%5) %1634 : bool = aten::__not__(%1633) %1635 : bool = prim::AutogradAnyNonZero(%6) %1636 : bool = aten::__not__(%1635) %1637 : bool = prim::AutogradAnyNonZero(%7) %1638 : bool = aten::__not__(%1637) %1639 : bool = prim::AutogradAnyNonZero(%8) %1640 : bool = aten::__not__(%1639) %1641 : bool = prim::AutogradAnyNonZero(%9) %1642 : bool = aten::__not__(%1641) %1643 : bool = prim::AutogradAnyNonZero(%10) %1644 : bool = aten::__not__(%1643) %1645 : bool = prim::AutogradAnyNonZero(%11) %1646 : bool = aten::__not__(%1645) %1647 : bool = prim::AutogradAnyNonZero(%12) %1648 : bool = aten::__not__(%1647) %1649 : bool = prim::AutogradAnyNonZero(%13) %1650 : bool = aten::__not__(%1649) %1651 : bool = prim::AutogradAnyNonZero(%14) %1652 : bool = aten::__not__(%1651) %1653 : bool = prim::AutogradAnyNonZero(%15) %1654 : bool = aten::__not__(%1653) %1655 : bool = prim::AutogradAnyNonZero(%16) %1656 : bool = aten::__not__(%1655) %1657 : bool = prim::AutogradAnyNonZero(%17) %1658 : bool = prim::AutogradAnyNonZero(%18) %1659 : bool = prim::AutogradAnyNonZero(%19) %1660 : bool = prim::AutogradAnyNonZero(%20) %1661 : bool = aten::__is__(%self_size.16, %1625) %1662 : bool = aten::__is__(%other_size.16, %1625) %1663 : bool = aten::__is__(%self_size.14, %1625) %1664 : bool = aten::__is__(%self_size.12, %1625) %1665 : bool = prim::AutogradAnyNonZero(%ingate.7) %1666 : bool = prim::AutogradAnyNonZero(%forgetgate.7) %1667 : bool = prim::AutogradAnyNonZero(%cellgate.7) %1668 : bool = prim::AutogradAnyNonZero(%30) %1669 : bool = prim::AutogradAnyNonZero(%31) %1670 : bool = aten::__is__(%self_size.10, %1625) %1671 : bool = aten::__is__(%other_size.10, %1625) %1672 : bool = prim::AutogradAnyNonZero(%34) %1673 : bool = prim::AutogradAnyNonZero(%35) %1674 : bool = aten::__is__(%self_size.8, %1625) %1675 : bool = aten::__is__(%other_size.8, %1625) %1676 : bool = aten::__is__(%self_size.6, %1625) %1677 : bool = aten::__is__(%other_size.6, %1625) %1678 : bool = prim::AutogradAnyNonZero(%outgate.7) %1679 : bool = prim::AutogradAnyNonZero(%41) %1680 : bool = prim::AutogradAnyNonZero(%42) %1681 : bool = prim::AutogradAnyNonZero(%43) %1682 : bool = aten::__is__(%self_size.4, %1625) %1683 : bool = aten::__is__(%other_size.4, %1625) %1684 : bool[] = prim::ListConstruct(%1626, %1628, %1630, %1632, %1634, %1636, %1638, %1640, %1642, %1644, %1646, %1648, %1650, %1652, %1654, %1656, %1657, %1658, %1659, %1660, %1661, %1662, %1663, %1664, %1665, %1666, %1667, %1668, %1669, %1670, %1671, %1672, %1673, %1674, %1675, %1676, %1677, %1678, %1679, %1680, %1681, %1682, %1683) %1685 : bool = aten::all(%1684) ``` Same example after this change: ``` %1625 : None = prim::Constant() %1626 : bool = aten::__is__(%self_size.16, %1625) %1627 : bool = aten::__is__(%other_size.16, %1625) %1628 : bool = aten::__is__(%self_size.14, %1625) %1629 : bool = aten::__is__(%self_size.12, %1625) %1630 : bool = aten::__is__(%self_size.10, %1625) %1631 : bool = aten::__is__(%other_size.10, %1625) %1632 : bool = aten::__is__(%self_size.8, %1625) %1633 : bool = aten::__is__(%other_size.8, %1625) %1634 : bool = aten::__is__(%self_size.6, %1625) %1635 : bool = aten::__is__(%other_size.6, %1625) 
%1636 : bool = aten::__is__(%self_size.4, %1625) %1637 : bool = aten::__is__(%other_size.4, %1625) %1638 : bool = prim::AutogradAllNonZero(%0, %17, %18, %19, %20, %ingate.7, %forgetgate.7, %cellgate.7, %30, %31, %34, %35, %outgate.7, %41, %42, %43) %1639 : bool = prim::AutogradAllZero(%2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16) %1640 : bool[] = prim::ListConstruct(%1626, %1627, %1628, %1629, %1630, %1631, %1632, %1633, %1634, %1635, %1636, %1637, %1638, %1639) %1641 : bool = aten::all(%1640) ``` My performance measurements showed some changes, but I don't really trust them and think that they are probably just a noise. Below are tables with min-aggregation over 10 runs: FastRNN models: | name | base time (s) | diff time (s) | % change | | :--- | ---: | ---: | ---: | | lstm[aten]:bwd | 30.059927 | 29.834089 | -0.8% | | lstm[aten]:fwd | 25.673708 | 25.700039 | 0.1% | | lstm[cudnn]:bwd | 17.866232 | 17.893120 | 0.2% | | lstm[cudnn]:fwd | 11.418444 | 11.408514 | -0.1% | | lstm[jit]:bwd | 27.127205 | 27.141029 | 0.1% | | lstm[jit]:fwd | 17.018047 | 16.975451 | -0.3% | | lstm[jit_multilayer]:bwd | 27.502396 | 27.365149 | -0.5% | | lstm[jit_multilayer]:fwd | 16.918591 | 16.917767 | -0.0% | | lstm[jit_premul]:bwd | 22.281199 | 22.215082 | -0.3% | | lstm[jit_premul]:fwd | 14.848708 | 14.896231 | 0.3% | | lstm[jit_premul_bias]:bwd | 20.761206 | 21.170969 | 2.0% | | lstm[jit_premul_bias]:fwd | 15.013515 | 15.037978 | 0.2% | | lstm[jit_simple]:bwd | 26.715771 | 26.697786 | -0.1% | | lstm[jit_simple]:fwd | 16.675898 | 16.545893 | -0.8% | | lstm[py]:bwd | 56.327065 | 54.731030 | -2.8% | | lstm[py]:fwd | 39.876324 | 39.230572 | -1.6% | Torch Hub models: | name | base time (s) | diff time (s) | % change | | :--- | ---: | ---: | ---: | | test_eval[BERT_pytorch-cuda-jit] | 0.111706 | 0.106604 | -4.6% | | test_eval[LearningToPaint-cuda-jit] | 0.002841 | 0.002801 | -1.4% | | test_eval[Super_SloMo-cuda-jit] | 0.384869 | 0.384737 | -0.0% | | test_eval[attension_is_all_you_nee...-cuda-jit] | 0.123857 | 0.123923 | 0.1% | | test_eval[demucs-cuda-jit] | 0.077270 | 0.076878 | -0.5% | | test_eval[fastNLP-cuda-jit] | 0.000255 | 0.000249 | -2.3% | | test_eval[moco-cuda-jit] | 0.426472 | 0.427380 | 0.2% | | test_eval[pytorch_CycleGAN_and_pix...-cuda-jit] | 0.026483 | 0.026423 | -0.2% | | test_eval[pytorch_mobilenet_v3-cuda-jit] | 0.036202 | 0.035853 | -1.0% | | test_eval[pytorch_struct-cuda-jit] | 0.001439 | 0.001495 | 3.9% | | test_train[BERT_pytorch-cuda-jit] | 0.247236 | 0.247188 | -0.0% | | test_train[Background_Matting-cuda-jit] | 3.536659 | 3.581864 | 1.3% | | test_train[LearningToPaint-cuda-jit] | 0.015341 | 0.015331 | -0.1% | | test_train[Super_SloMo-cuda-jit] | 1.018626 | 1.019098 | 0.0% | | test_train[attension_is_all_you_nee...-cuda-jit] | 0.446314 | 0.444893 | -0.3% | | test_train[demucs-cuda-jit] | 0.169647 | 0.169846 | 0.1% | | test_train[fastNLP-cuda-jit] | 0.001990 | 0.001978 | -0.6% | | test_train[moco-cuda-jit] | 0.855323 | 0.856974 | 0.2% | | test_train[pytorch_mobilenet_v3-cuda-jit] | 0.497723 | 0.485416 | -2.5% | | test_train[pytorch_struct-cuda-jit] | 0.309692 | 0.308792 | -0.3% | Differential Revision: D23794659 Test Plan: Imported from OSS Reviewed By: bertmaher Pulled By: ZolotukhinM fbshipit-source-id: 859b68868ef839c5c6cbc7021879ee22d3144ea8 --- aten/src/ATen/core/interned_strings.h | 2 ++ .../jit/passes/specialize_autogradzero.cpp | 23 ++++++++++--- torch/csrc/jit/runtime/operator.cpp | 2 ++ torch/csrc/jit/runtime/profiling_record.cpp | 2 ++ 
.../jit/runtime/register_prim_ops_fulljit.cpp | 32 +++++++++++++++++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index bce5b27e37b1..b279a2400350 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -59,6 +59,8 @@ namespace c10 { _(prim, Store) \ _(prim, AutogradZero) \ _(prim, AutogradAnyNonZero) \ + _(prim, AutogradAllNonZero) \ + _(prim, AutogradAllZero) \ _(prim, Starred) \ _(prim, TupleConstruct) \ _(prim, TupleUnpack) \ diff --git a/torch/csrc/jit/passes/specialize_autogradzero.cpp b/torch/csrc/jit/passes/specialize_autogradzero.cpp index ad1fb36da5de..2fc95ae72339 100644 --- a/torch/csrc/jit/passes/specialize_autogradzero.cpp +++ b/torch/csrc/jit/passes/specialize_autogradzero.cpp @@ -117,6 +117,8 @@ struct AutogradZeroSpecializer { WithInsertPoint wip{graph_->block()->param_node()->next()}; Value* none_val = graph_->insertConstant(IValue()); std::vector checks; + std::vector zero_values; + std::vector nonzero_values; for (auto inp : graph_->inputs()) { if (auto profile_optional_node = getUse(inp, prim::profile_optional)) { @@ -146,15 +148,16 @@ struct AutogradZeroSpecializer { } state_[inp] = *pttp->undefined() ? State::Zero : State::Nonzero; - auto check = graph_->insert(prim::AutogradAnyNonZero, {inp}); + if (*pttp->undefined()) { - check = graph_->insert(aten::__not__, {check}); + zero_values.push_back(inp); + } else { + nonzero_values.push_back(inp); } - checks.push_back(check); } // unable to specialize any of the inputs - if (checks.size() == 0) { + if (nonzero_values.size() == 0 && zero_values.size() == 0) { GRAPH_DUMP("Unable to add any specialization guards", graph_); versioning_if->destroy(); // the checks we inserted will be cleaned up @@ -162,6 +165,18 @@ struct AutogradZeroSpecializer { return nullptr; } + Node* nonzero_check = graph_->insert(prim::AutogradAllNonZero, {})->node(); + for (Value* v : nonzero_values) { + nonzero_check->addInput(v); + } + checks.push_back(nonzero_check->output()); + + Node* zero_check = graph_->insert(prim::AutogradAllZero, {})->node(); + for (Value* v : zero_values) { + zero_check->addInput(v); + } + checks.push_back(zero_check->output()); + Value* bool_list = graph_->insertNode(graph_->createList(BoolType::get(), checks)) ->output(); diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index f12c8186396e..2bd6a2b47ec9 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -224,6 +224,8 @@ bool printerHasSpecialCaseFor(Symbol sym) { c10::onnx::Shape, // only used in onnx prim::AutogradZero, // temporarily inserted by autograd prim::AutogradAnyNonZero, // temporarily inserted by autograd + prim::AutogradAllNonZero, // temporarily inserted by autograd + prim::AutogradAllZero, // temporarily inserted by autograd prim::AutogradAdd, // temporarily inserted by autograd prim::ConstantChunk, // optimization pass adds it prim::DifferentiableGraph, // optimization pass adds it, diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp index 6ad16774789b..98c073668170 100644 --- a/torch/csrc/jit/runtime/profiling_record.cpp +++ b/torch/csrc/jit/runtime/profiling_record.cpp @@ -171,6 +171,8 @@ bool needsProfiledInputs(Node* n) { // specialize_autogradzero case prim::AutogradAdd: case prim::AutogradAnyNonZero: + case prim::AutogradAllNonZero: + case prim::AutogradAllZero: case 
prim::AutogradZero: // peephole case aten::dim: diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index e3b0fa1e88c3..dc075ce14166 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -442,6 +442,38 @@ RegisterOperators reg( stack->emplace_back(result); }, aliasAnalysisFromSchema()), + Operator( + "prim::AutogradAllZero(...) -> bool", + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = true; + for (const IValue& v : last(stack, num_inputs)) { + TORCH_INTERNAL_ASSERT(v.isTensor()); + if (v.toTensor().defined()) { + result = false; + break; + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), + Operator( + "prim::AutogradAllNonZero(...) -> bool", + [](Stack* stack) { + auto num_inputs = pop(stack).toInt(); + bool result = true; + for (const IValue& v : last(stack, num_inputs)) { + TORCH_INTERNAL_ASSERT(v.isTensor()); + if (!v.toTensor().defined()) { + result = false; + break; + } + } + drop(stack, num_inputs); + stack->emplace_back(result); + }, + aliasAnalysisFromSchema()), Operator( "prim::AutogradAdd(Any a, Any b) -> Any", [](Stack* stack) { From cd7a68228280e8497ebb55402c1fd7ce0c905b92 Mon Sep 17 00:00:00 2001 From: Danny Huang Date: Thu, 24 Sep 2020 14:39:58 -0700 Subject: [PATCH 105/449] [caffe2] adds hypothesis test for queue ops cancel (#45178) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45178 ## Motivation * To be able to make C2 ops cancellable so we can safely exit. * Some C2 operators are now blocking thus being non-cancellable. If an error occurs we need to be able to safely stop all net execution so we can throw the exception to the caller. ## Summary * Adds a hypothesis test for queue ops cancellation. Test Plan: ## Unit test added to verify that queue ops propagate errors ``` buck test caffe2/caffe2/python:hypothesis_test buck test caffe2/caffe2/python:hypothesis_test -- test_safe_dequeue_blob__raises_exception_when_hang --stress-runs 1000 ``` ``` Summary Pass: 1000 ListingSuccess: 1 ``` Reviewed By: d4l3k Differential Revision: D23847576 fbshipit-source-id: 2fc351e1ee13ea8b32d976216d2d01dfb6fcc1ad --- caffe2/python/hypothesis_test.py | 56 +++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 8a286383f60f..045677f8422a 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -10,7 +10,7 @@ from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st import unittest -import os +import threading from caffe2.python import core, workspace, tt_core, dyndep import caffe2.python.hypothesis_test_util as hu @@ -2695,6 +2695,60 @@ def histogram(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertReferenceChecks(gc, op, [X], histogram) + @settings(max_examples=1, deadline=None) + @given( + queue_capacity=st.integers(2, 2), + time_sleep=st.integers(5, 10), + num_blobs_to_equeue=st.integers(1, 1), + num_blobs_to_dequeue=st.integers(2, 2), + ) + def test_safe_dequeue_blob__raises_exception_when_hang( + self, + queue_capacity, + time_sleep, + num_blobs_to_equeue, + num_blobs_to_dequeue, + ): + r""" + Tests SafeDequeueBlobsOp being cancellable. 
+ + Create a queue with the number of BlobsQueue less than the number + SafeDequeueBlobs to cause the hanging behavior when running the Net. + + Then call cancel from the previous sleeping thread to ensure exception + is raised. + """ + + def _net_instance_cancel(net_instance): + time.sleep(time_sleep) + net_instance.cancel() + + init_net = core.Net("init_net") + init_net.Proto().type = "async_scheduling" + + queue = init_net.CreateBlobsQueue( + [], + "queue_name", + capacity=queue_capacity, + num_blobs=num_blobs_to_equeue, + ) + + ws = workspace.Workspace() + ws.create_net(init_net).run() + + net = core.Net("net") + net.Proto().type = "async_scheduling" + + blobs = net.SafeDequeueBlobs([queue], num_blobs_to_dequeue) + + net_instance = ws.create_net(net) + + t = threading.Thread(target=_net_instance_cancel, args=[net_instance]) + t.start() + + with self.assertRaises(Exception): + net_instance.run() + t.join() if __name__ == "__main__": From b84dd771e69c67162716ddd1baa4d9b062531c82 Mon Sep 17 00:00:00 2001 From: vishalrao487 <111801046@smail.iitpkd.ac.in> Date: Thu, 24 Sep 2020 14:40:53 -0700 Subject: [PATCH 106/449] Grammatically updated the tech docs (#45192) Summary: Small grammatical update to the [https://pytorch.org/docs/stable/tensors.html](url) docs. **_update1_** ![update1](https://user-images.githubusercontent.com/62737243/93969792-5c0ea800-fd8a-11ea-8c9f-0033f51a1fdc.png) **_update2_** ![update2](https://user-images.githubusercontent.com/62737243/93969801-603ac580-fd8a-11ea-812d-d3026b9fc8a5.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/45192 Reviewed By: bwasti Differential Revision: D23877870 Pulled By: ezyang fbshipit-source-id: 929ba3d479925b5132dbe87fad2da487408db7c7 --- docs/source/tensors.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index cd1c363604fe..7cd1a88f82b3 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -8,7 +8,7 @@ torch.Tensor A :class:`torch.Tensor` is a multi-dimensional matrix containing elements of a single data type. -Torch defines 10 tensor types with CPU and GPU variants: +Torch defines 10 tensor types with CPU and GPU variants which are as follows: ========================== =========================================== ============================= ================================ Data type dtype CPU tensor GPU tensor @@ -32,7 +32,7 @@ Boolean ``torch.bool`` :class: Sometimes referred to as binary16: uses 1 sign, 5 exponent, and 10 significand bits. Useful when precision is important at the expense of range. .. [2] - Sometimes referred to as Brain Floating Point: use 1 sign, 8 exponent and 7 + Sometimes referred to as Brain Floating Point: uses 1 sign, 8 exponent, and 7 significand bits. Useful when range is important, since it has the same number of exponent bits as ``float32`` From 6311c5a483e446abaca2d95c2d58b5f462911e6f Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Thu, 24 Sep 2020 15:04:51 -0700 Subject: [PATCH 107/449] Minor touchups. 
(#44317) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44317 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820828 Pulled By: AshkanAliabadi fbshipit-source-id: b83bdea9aed2fb52bd254ff15914d55a1af58c04 --- aten/src/ATen/native/vulkan/Vulkan.h | 2 +- aten/src/ATen/native/vulkan/api/Allocator.h | 12 +++- aten/src/ATen/native/vulkan/api/Command.cpp | 62 ++++++++++++++++--- aten/src/ATen/native/vulkan/api/Command.h | 2 +- aten/src/ATen/native/vulkan/api/Common.h | 22 +++++-- aten/src/ATen/native/vulkan/api/Context.cpp | 28 +++++++-- aten/src/ATen/native/vulkan/api/Context.h | 6 +- .../src/ATen/native/vulkan/api/Descriptor.cpp | 39 +++++++++++- aten/src/ATen/native/vulkan/api/Descriptor.h | 10 +-- aten/src/ATen/native/vulkan/api/Pipeline.cpp | 59 ++++++++++++++++-- aten/src/ATen/native/vulkan/api/Pipeline.h | 2 +- aten/src/ATen/native/vulkan/api/Resource.cpp | 42 ++++++++++++- aten/src/ATen/native/vulkan/api/Resource.h | 23 +++++-- aten/src/ATen/native/vulkan/api/Shader.cpp | 29 ++++++++- aten/src/ATen/native/vulkan/api/Shader.h | 12 ++-- 15 files changed, 294 insertions(+), 56 deletions(-) diff --git a/aten/src/ATen/native/vulkan/Vulkan.h b/aten/src/ATen/native/vulkan/Vulkan.h index df9a53f7076d..c2b1775e8f0a 100644 --- a/aten/src/ATen/native/vulkan/Vulkan.h +++ b/aten/src/ATen/native/vulkan/Vulkan.h @@ -456,7 +456,7 @@ class ComputeUnit final { void createComputePipelineCompile( const std::string& glslSrc, const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout& descrSetLayout, + const VkDescriptorSetLayout descrSetLayout, const WorkGroupSize workGroupSize); #endif diff --git a/aten/src/ATen/native/vulkan/api/Allocator.h b/aten/src/ATen/native/vulkan/api/Allocator.h index afa720a515e6..f0f0c9baa59c 100644 --- a/aten/src/ATen/native/vulkan/api/Allocator.h +++ b/aten/src/ATen/native/vulkan/api/Allocator.h @@ -2,11 +2,19 @@ #include +#ifdef DEBUG + #define VMA_DEBUG_LOG(format, ...) \ + do { \ + printf(format, ##__VA_ARGS__); \ + printf("\n"); \ + } while(false) +#endif /* DEBUG */ + #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnullability-completeness" #pragma clang diagnostic ignored "-Wunused-variable" -#endif +#endif /* __clang__ */ // Do NOT include vk_mem_alloc.h directly. // Always include this file (Allocator.h) instead. 
@@ -15,4 +23,4 @@ #ifdef __clang__ #pragma clang diagnostic pop -#endif +#endif /* __clang__ */ diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 21279b408233..48512215c5fc 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -7,6 +7,9 @@ namespace api { Command::Pool::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( @@ -20,7 +23,14 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( VkCommandPool command_pool{}; VK_CHECK(vkCreateCommandPool( - device_, &command_pool_create_info, nullptr, &command_pool)); + device_, + &command_pool_create_info, + nullptr, + &command_pool)); + + TORCH_CHECK( + command_pool, + "Invalid Vulkan command pool!"); return Handle{ command_pool, @@ -31,8 +41,13 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( void Command::Pool::purge( const VkDevice device, const VkCommandPool command_pool) { - TORCH_INTERNAL_ASSERT(device, "Invalid Vulkan device!"); - TORCH_INTERNAL_ASSERT(command_pool, "Invalid Vulkan command pool!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); VK_CHECK(vkResetCommandPool(device, command_pool, 0u)); } @@ -42,6 +57,14 @@ namespace { VkCommandBuffer allocate_command_buffer( const VkDevice device, const VkCommandPool command_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); + const VkCommandBufferAllocateInfo command_buffer_allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, @@ -52,7 +75,13 @@ VkCommandBuffer allocate_command_buffer( VkCommandBuffer command_buffer{}; VK_CHECK(vkAllocateCommandBuffers( - device, &command_buffer_allocate_info, &command_buffer)); + device, + &command_buffer_allocate_info, + &command_buffer)); + + TORCH_CHECK( + command_buffer, + "Invalid Vulkan command buffer!"); return command_buffer; } @@ -61,6 +90,9 @@ VkCommandBuffer allocate_command_buffer( Command::Buffer::Buffer(const VkDevice device, const VkCommandPool command_pool) : command_buffer_(allocate_command_buffer(device, command_pool)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_buffer_, + "Invalid Vulkan command buffer!"); } void Command::Buffer::Buffer::begin() { @@ -71,7 +103,9 @@ void Command::Buffer::Buffer::begin() { nullptr, }; - VK_CHECK(vkBeginCommandBuffer(command_buffer_, &command_buffer_begin_info)); + VK_CHECK(vkBeginCommandBuffer( + command_buffer_, + &command_buffer_begin_info)); } void Command::Buffer::Buffer::end() { @@ -79,16 +113,26 @@ void Command::Buffer::Buffer::end() { } void Command::Buffer::bind(const VkPipeline pipeline) { - TORCH_INTERNAL_ASSERT(pipeline, "Invalid Vulkan pipeline!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline, + "Invalid Vulkan pipeline!"); - vkCmdBindPipeline(command_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + vkCmdBindPipeline( + command_buffer_, + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); } void Command::Buffer::bind( const VkPipelineLayout pipeline_layout, const VkDescriptorSet descriptor_set) { - TORCH_INTERNAL_ASSERT(pipeline_layout, "Invalid Vulkan pipeline layout!"); - TORCH_INTERNAL_ASSERT(descriptor_set, 
"Invalid Vulkan descriptor set!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_set, + "Invalid Vulkan descriptor set!"); vkCmdBindDescriptorSets( command_buffer_, diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 462a50fef7fd..554e6fdf373e 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -9,7 +9,7 @@ namespace native { namespace vulkan { namespace api { -struct C10_EXPORT Command final { +struct Command final { // // Pool // diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index 0c1e7cc4720b..aec26bf987a0 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -24,10 +24,10 @@ at::native::vulkan::api::destroy_##Handle #define VK_DELETER_DISPATCHABLE_DECLARE(Handle) \ - C10_EXPORT void destroy_##Handle(const Vk##Handle handle) + void destroy_##Handle(const Vk##Handle handle) #define VK_DELETER_NON_DISPATCHABLE_DECLARE(Handle) \ - class C10_EXPORT destroy_##Handle final { \ + class destroy_##Handle final { \ public: \ explicit destroy_##Handle(const VkDevice device); \ void operator()(const Vk##Handle handle) const; \ @@ -40,6 +40,14 @@ namespace native { namespace vulkan { namespace api { +struct Command; +class Context; +struct Descriptor; +struct Pipeline; +struct Resource; +class Runtime; +struct Shader; + VK_DELETER_DISPATCHABLE_DECLARE(Instance); VK_DELETER_DISPATCHABLE_DECLARE(Device); VK_DELETER_NON_DISPATCHABLE_DECLARE(Semaphore); @@ -78,11 +86,13 @@ class Handle final { Handle(const Handle&) = delete; Handle& operator=(const Handle&) = delete; Handle(Handle&&); - Handle& operator=(Handle&&); + Handle& operator=(Handle&&) &; + Handle& operator=(Handle&&) && = delete; ~Handle(); operator bool() const; - Type get() const; + Type get() const &; + Type get() const && = delete; Type release(); void reset(Type payload = kNull); @@ -112,7 +122,7 @@ inline Handle::Handle(Handle&& handle) template inline Handle& -Handle::operator=(Handle&& handle) +Handle::operator=(Handle&& handle) & { reset(handle.release()); deleter_ = std::move(handle.deleter_); @@ -130,7 +140,7 @@ inline Handle::operator bool() const { } template -inline Type Handle::get() const { +inline Type Handle::get() const & { return payload_; } diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index 76a245e16d38..206967b550b2 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -82,7 +82,9 @@ VkInstance create_instance(const bool enable_validation_layers) { instance_extension_count); VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, instance_extension_properties.data())); + nullptr, + &instance_extension_count, + instance_extension_properties.data())); constexpr const char* const requested_instance_extensions[]{ VK_EXT_DEBUG_REPORT_EXTENSION_NAME, @@ -121,6 +123,7 @@ VkInstance create_instance(const bool enable_validation_layers) { VkInstance instance{}; VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); return instance; } @@ -159,13 +162,20 @@ VkDebugReportCallbackEXT create_debug_report_callback( nullptr, &debug_report_callback)); + TORCH_CHECK( + debug_report_callback, + "Invalid Vulkan debug report 
callback!"); + return debug_report_callback; } VkPhysicalDevice acquire_physical_device(const VkInstance instance) { uint32_t device_count = 0; VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - TORCH_CHECK(device_count > 0, "Vulkan: Could not find a device with Vulkan support!"); + + TORCH_CHECK( + device_count > 0, + "Vulkan: Could not find a device with Vulkan support!"); std::vector devices(device_count); VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); @@ -187,13 +197,16 @@ uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device physical_device, &queue_family_count, nullptr); TORCH_CHECK( - queue_family_count > 0, "Vulkan: Invalid number of queue families!"); + queue_family_count > 0, + "Vulkan: Invalid number of queue families!"); std::vector queue_families_properties( queue_family_count); vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, queue_families_properties.data()); + physical_device, + &queue_family_count, + queue_families_properties.data()); for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { const VkQueueFamilyProperties& properties = queue_families_properties[i]; @@ -234,6 +247,7 @@ VkDevice create_device( VkDevice device{}; VK_CHECK(vkCreateDevice(physical_device, &device_create_info, nullptr, &device)); + TORCH_CHECK(device, "Invalid Vulkan device!"); return device; } @@ -243,6 +257,8 @@ VkQueue acquire_queue( const uint32_t compute_queue_family_index) { VkQueue queue{}; vkGetDeviceQueue(device, compute_queue_family_index, 0, &queue); + TORCH_CHECK(queue, "Invalid Vulkan queue!"); + return queue; } @@ -309,11 +325,11 @@ bool available() { return initialize(); } -Context& context() { +Context* context() { Context* const context = initialize(); TORCH_CHECK(context, "Vulkan: Backend not available on this platform!"); - return *context; + return context; } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index d57eab66108e..7cec6ada5d5e 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -19,7 +19,7 @@ namespace api { // user. 
// -class C10_EXPORT Context final { +class Context final { public: explicit Context(bool enable_validation_layers); ~Context() = default; @@ -90,8 +90,8 @@ class C10_EXPORT Context final { Resource resource_; }; -C10_EXPORT bool available(); -C10_EXPORT Context& context(); +bool available(); +Context* context(); } // namespace api } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index 1b5ea94341a3..bab10466ea02 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -46,6 +46,9 @@ const Descriptor::Pool::Descriptor Descriptor::Pool::kDefault{ Descriptor::Pool::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); } typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator()( @@ -61,7 +64,14 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() VkDescriptorPool descriptor_pool{}; VK_CHECK(vkCreateDescriptorPool( - device_, &descriptor_pool_create_info, nullptr, &descriptor_pool)); + device_, + &descriptor_pool_create_info, + nullptr, + &descriptor_pool)); + + TORCH_CHECK( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); return Handle{ descriptor_pool, @@ -72,12 +82,29 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() void Descriptor::Pool::purge( const VkDevice device, const VkDescriptorPool descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); + VK_CHECK(vkResetDescriptorPool(device, descriptor_pool, 0u)); } -Descriptor::Factory::Factory(const VkDevice device, const VkDescriptorPool descriptor_pool) +Descriptor::Factory::Factory( + const VkDevice device, + const VkDescriptorPool descriptor_pool) : device_(device), descriptor_pool_(descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); } VkDescriptorSet Descriptor::Factory::allocate( @@ -92,7 +119,13 @@ VkDescriptorSet Descriptor::Factory::allocate( VkDescriptorSet descriptor_set{}; VK_CHECK(vkAllocateDescriptorSets( - device_, &descriptor_set_allocate_info, &descriptor_set)); + device_, + &descriptor_set_allocate_info, + &descriptor_set)); + + TORCH_CHECK( + descriptor_set, + "Invalid Vulkan descriptor set!"); return descriptor_set; } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index 3e339ae4641f..da4a2a03e2f9 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -49,7 +49,7 @@ namespace api { // as well. This behavior is by design. 
// -struct C10_EXPORT Descriptor final { +struct Descriptor final { // // Pool // @@ -156,8 +156,8 @@ inline size_t Descriptor::Pool::Factory::Hasher::operator()( } // namespace at inline bool operator==( - const VkDescriptorPoolSize& descriptor_pool_size_1, - const VkDescriptorPoolSize& descriptor_pool_size_2) { - return (descriptor_pool_size_1.type == descriptor_pool_size_2.type) && - (descriptor_pool_size_1.descriptorCount == descriptor_pool_size_2.descriptorCount); + const VkDescriptorPoolSize& _1, + const VkDescriptorPoolSize& _2) { + return (_1.type == _2.type) && + (_1.descriptorCount == _2.descriptorCount); } diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 303eea7cb401..3c845c5fae32 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -7,10 +7,17 @@ namespace api { Pipeline::Layout::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); + const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, nullptr, @@ -23,7 +30,14 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() VkPipelineLayout pipeline_layout{}; VK_CHECK(vkCreatePipelineLayout( - device_, &pipeline_layout_create_info, nullptr, &pipeline_layout)); + device_, + &pipeline_layout_create_info, + nullptr, + &pipeline_layout)); + + TORCH_CHECK( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); return Handle{ pipeline_layout, @@ -34,6 +48,10 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() namespace { VkPipelineCache create_pipeline_cache(const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VkPipelineCacheCreateInfo pipeline_cache_create_info{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, nullptr, @@ -44,7 +62,14 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { VkPipelineCache pipeline_cache{}; VK_CHECK(vkCreatePipelineCache( - device, &pipeline_cache_create_info, nullptr, &pipeline_cache)); + device, + &pipeline_cache_create_info, + nullptr, + &pipeline_cache)); + + TORCH_CHECK( + pipeline_cache, + "Invalid Vulkan pipeline cache!"); return pipeline_cache; } @@ -53,11 +78,28 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { Pipeline::Factory::Factory(const VkDevice device) : device_(device), - pipeline_cache_(create_pipeline_cache(device), VK_DELETER(PipelineCache)(device)) { + pipeline_cache_( + create_pipeline_cache(device), + VK_DELETER(PipelineCache)(device)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_cache_, + "Invalid Vulkan pipeline cache!"); } typename Pipeline::Factory::Handle Pipeline::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.shader_module, + "Invalid Vulkan shader module!"); + constexpr uint32_t x_offset = 0u; constexpr uint32_t x_size = sizeof(Shader::WorkGroup::x); constexpr uint32_t y_offset = x_offset + x_size; @@ -113,7 
+155,16 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( VkPipeline pipeline{}; VK_CHECK(vkCreateComputePipelines( - device_, pipeline_cache_.get(), 1u, &compute_pipeline_create_info, nullptr, &pipeline)); + device_, + pipeline_cache_.get(), + 1u, + &compute_pipeline_create_info, + nullptr, + &pipeline)); + + TORCH_CHECK( + pipeline, + "Invalid Vulkan pipeline!"); return Handle{ pipeline, diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index a5d72324c36e..0ecef40c8b19 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -29,7 +29,7 @@ namespace api { // these Vulkan objects. // -struct C10_EXPORT Pipeline final { +struct Pipeline final { // // Layout // diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index c538a1b6e2d0..7163294bd1d9 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -10,6 +10,18 @@ VmaAllocator create_allocator( const VkInstance instance, const VkPhysicalDevice physical_device, const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VmaAllocatorCreateInfo allocator_create_info{ 0u, physical_device, @@ -27,6 +39,7 @@ VmaAllocator create_allocator( VmaAllocator allocator{}; VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator)); + TORCH_CHECK(allocator, "Invalid VMA allocator!"); return allocator; } @@ -87,6 +100,13 @@ Resource::Memory::Scope::Scope( : allocator_(allocator), allocation_(allocation), access_(access) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocator, + "Invalid VMA allocator!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocation, + "Invalid VMA allocation!"); } void Resource::Memory::Scope::operator()(const void* const data) const { @@ -109,7 +129,12 @@ Resource::Pool::Pool( const VkPhysicalDevice physical_device, const VkDevice device) : device_(device), - allocator_(create_allocator(instance, physical_device, device), vmaDestroyAllocator) { + allocator_( + create_allocator( + instance, + physical_device, + device), + vmaDestroyAllocator) { buffers_.reserve(Configuration::kReserve); images_.reserve(Configuration::kReserve); } @@ -141,6 +166,9 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) &allocation, &allocation_info)); + TORCH_CHECK(buffer, "Invalid Vulkan buffer!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + buffers_.emplace_back( Buffer{ buffer, @@ -189,6 +217,9 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { &allocation, &allocation_info)); + TORCH_CHECK(image, "Invalid Vulkan image!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + const VkImageViewCreateInfo image_view_create_info{ VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, nullptr, @@ -213,7 +244,14 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { VkImageView view{}; VK_CHECK(vkCreateImageView( - device_, &image_view_create_info, nullptr, &view)) + device_, + &image_view_create_info, + nullptr, + &view)); + + TORCH_CHECK( + view, + "Invalid Vulkan image view!"); images_.emplace_back( Image{ diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index 
04cd9a067663..a74a3c2d3c89 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -8,7 +8,7 @@ namespace native { namespace vulkan { namespace api { -struct C10_EXPORT Resource final { +struct Resource final { /* Memory */ @@ -25,12 +25,25 @@ struct C10_EXPORT Resource final { template< typename Type, typename Pointer = std::add_pointer_t>> - Data map() const; + Data map() const &; template< typename Type, typename Pointer = std::add_pointer_t> - Data map(); + Data map() &; + + private: + // Intentionally disabed to ensure memory access is always properly + // encapsualted in a scoped map-unmap region. Allowing below overloads + // to be invoked on a temporary would open the door to the possibility + // of accessing the underlying memory out of the expected scope making + // for seemingly ineffective memory writes and hard to hunt down bugs. + + template + Data map() const && = delete; + + template + Data map() && = delete; }; /* @@ -144,7 +157,7 @@ class Resource::Memory::Scope final { }; template -inline Resource::Memory::Data Resource::Memory::map() const { +inline Resource::Memory::Data Resource::Memory::map() const & { void* map(const Memory& memory); return Data{ @@ -154,7 +167,7 @@ inline Resource::Memory::Data Resource::Memory::map() const { } template -inline Resource::Memory::Data Resource::Memory::map() { +inline Resource::Memory::Data Resource::Memory::map() & { void* map(const Memory& memory); return Data{ diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index bbd3e3464d78..4cde24a2eef9 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -11,6 +11,9 @@ namespace api { Shader::Layout::Factory::Factory(const VkDevice device) : device_(device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( @@ -25,7 +28,14 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( VkDescriptorSetLayout descriptor_set_layout{}; VK_CHECK(vkCreateDescriptorSetLayout( - device_, &descriptor_set_layout_create_info, nullptr, &descriptor_set_layout)); + device_, + &descriptor_set_layout_create_info, + nullptr, + &descriptor_set_layout)); + + TORCH_CHECK( + descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); return Handle{ descriptor_set_layout, @@ -35,6 +45,8 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( Shader::Descriptor::Descriptor(const char* const glsl) : type(Type::Source) { + TORCH_CHECK(glsl, "Invalid shader source code!"); + shader.source = { glsl, 0u, @@ -43,6 +55,8 @@ Shader::Descriptor::Descriptor(const char* const glsl) Shader::Descriptor::Descriptor(const uint32_t* const code, const uint32_t size) : type(Type::Binary) { + TORCH_CHECK(code && (0u != size), "Invalid shader binary!"); + shader.binary = { code, size, @@ -68,6 +82,10 @@ struct Shader::Factory::Compiler final { } std::vector compile(const char* const source) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + source, + "Invalid shader source code!"); + const shaderc::SpvCompilationResult result = context.CompileGlslToSpv( source, ::strlen(source), @@ -139,7 +157,14 @@ typename Shader::Factory::Handle Shader::Factory::operator()( VkShaderModule shader_module{}; VK_CHECK(vkCreateShaderModule( - device_, &shader_module_create_info, nullptr, &shader_module)); + device_, + &shader_module_create_info, + nullptr, + 
&shader_module)); + + TORCH_CHECK( + shader_module, + "Invalid Vulkan shader module!"); return Handle{ shader_module, diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index 0fd2fa01614b..4a0080cb888d 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -32,7 +32,7 @@ namespace api { // and destruct the aforementioned Vulkan objects. // -struct C10_EXPORT Shader final { +struct Shader final { // // Layout // @@ -187,11 +187,11 @@ inline size_t Shader::Layout::Factory::Hasher::operator()( } inline bool operator==( - const Shader::WorkGroup& work_group_1, - const Shader::WorkGroup& work_group_2) { - return (work_group_1.x == work_group_2.x) && - (work_group_1.y == work_group_2.y) && - (work_group_1.z == work_group_2.z); + const Shader::WorkGroup& _1, + const Shader::WorkGroup& _2) { + return (_1.x == _2.x) && + (_1.y == _2.y) && + (_1.z == _2.z); } inline bool operator==( From 5a59330647d1c461dc49dc84b5ff5f18e8a192d8 Mon Sep 17 00:00:00 2001 From: Ashkan Aliabadi Date: Thu, 24 Sep 2020 15:04:51 -0700 Subject: [PATCH 108/449] Add architectural support for multi-GPU. (#44059) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44059 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D23820825 Pulled By: AshkanAliabadi fbshipit-source-id: 0719b00581487a77ebadff867d1e4ac89354bf90 --- aten/src/ATen/native/vulkan/api/Adapter.h | 36 ++ aten/src/ATen/native/vulkan/api/Command.cpp | 6 +- aten/src/ATen/native/vulkan/api/Command.h | 10 +- aten/src/ATen/native/vulkan/api/Common.h | 7 + aten/src/ATen/native/vulkan/api/Context.cpp | 300 ++------------- aten/src/ATen/native/vulkan/api/Context.h | 68 ++-- .../src/ATen/native/vulkan/api/Descriptor.cpp | 6 +- aten/src/ATen/native/vulkan/api/Descriptor.h | 12 +- aten/src/ATen/native/vulkan/api/Pipeline.cpp | 12 +- aten/src/ATen/native/vulkan/api/Pipeline.h | 14 +- aten/src/ATen/native/vulkan/api/Resource.cpp | 22 +- aten/src/ATen/native/vulkan/api/Resource.h | 12 +- aten/src/ATen/native/vulkan/api/Runtime.cpp | 343 ++++++++++++++++++ aten/src/ATen/native/vulkan/api/Runtime.h | 64 ++++ aten/src/ATen/native/vulkan/api/Shader.cpp | 15 +- aten/src/ATen/native/vulkan/api/Shader.h | 14 +- aten/src/ATen/native/vulkan/api/api.h | 2 + aten/src/ATen/test/vulkan_api_test.cpp | 5 - 18 files changed, 582 insertions(+), 366 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/api/Adapter.h create mode 100644 aten/src/ATen/native/vulkan/api/Runtime.cpp create mode 100644 aten/src/ATen/native/vulkan/api/Runtime.h diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h new file mode 100644 index 000000000000..239edfb74518 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Adapter represents a physical device and its properties. Adapters +// are enumerated through the Runtime and are used in creation of Contexts. +// Each tensor in PyTorch is associated with a Context to make the +// device <-> tensor affinity explicit. 
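The comment above positions the Adapter as the hand-off point between physical
device enumeration and Context creation. A minimal, illustrative sketch of how a
caller might drive that, using the Runtime::select API added elsewhere in this
same change; the discrete-GPU preference is an invented example policy, and
select() throws if no enumerated adapter satisfies the predicate:

  using namespace at::native::vulkan::api;

  // The selector is invoked once per enumerated physical device and returns
  // true for the adapter the caller wants to keep.
  const Adapter adapter = runtime()->select([](const Adapter& adapter) {
    // Example policy only: prefer a discrete GPU.
    return VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU == adapter.properties.deviceType;
  });

  // adapter.handle, adapter.compute_queue_family_index and
  // adapter.has_unified_memory() then drive logical device creation,
  // typically by constructing a Context from the adapter.

The default path in this change keeps the policy trivial and simply returns true
from the selector, taking the first adapter the instance reports.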
+// + +struct Adapter final { + Runtime* runtime; + VkPhysicalDevice handle; + VkPhysicalDeviceProperties properties; + VkPhysicalDeviceMemoryProperties memory_properties; + uint32_t compute_queue_family_index; + + inline bool has_unified_memory() const { + // Ideally iterate over all memory types to see if there is a pool that + // is both host-visible, and device-local. This should be a good proxy + // for now. + return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; + } +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 48512215c5fc..a7793aea16dc 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -5,15 +5,15 @@ namespace native { namespace vulkan { namespace api { -Command::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Command::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); } typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkCommandPoolCreateInfo command_pool_create_info{ VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 554e6fdf373e..b0c171faa490 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -29,7 +29,7 @@ struct Command final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(CommandPool) Deleter; @@ -52,8 +52,8 @@ struct Command final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkCommandPool command_pool); @@ -78,8 +78,8 @@ struct Command final { VkCommandBuffer command_buffer_; }; - explicit Command(const VkDevice device) - : pool(device) { + explicit Command(const GPU& gpu) + : pool(gpu) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index aec26bf987a0..cbd53e8045ef 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -40,6 +40,7 @@ namespace native { namespace vulkan { namespace api { +struct Adapter; struct Command; class Context; struct Descriptor; @@ -48,6 +49,12 @@ struct Resource; class Runtime; struct Shader; +struct GPU final { + const Adapter* adapter; + VkDevice device; + VkQueue queue; +}; + VK_DELETER_DISPATCHABLE_DECLARE(Instance); VK_DELETER_DISPATCHABLE_DECLARE(Device); VK_DELETER_NON_DISPATCHABLE_DECLARE(Semaphore); diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index 206967b550b2..d0fa08dbde1d 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -8,221 +8,31 @@ namespace vulkan { namespace api { namespace { -struct Configuration final { -#ifndef DEBUG - static constexpr bool kEnableValidationLayers = false; -#else - static constexpr bool kEnableValidationLayers = true; -#endif -}; - -VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( - const VkDebugReportFlagsEXT flags, - 
const VkDebugReportObjectTypeEXT /* object_type */, - const uint64_t /* object */, - const size_t /* location */, - const int32_t message_code, - const char* const layer_prefix, - const char* const message, - void* const /* user_data */) { - std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; - const std::string log = stream.str(); - - if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG(ERROR) << log; - } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG(WARNING) << log; - } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { - LOG(WARNING) << "Performance:" << log; - } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG(INFO) << log; - } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { - LOG(INFO) << "Debug: " << log; - } - - return VK_FALSE; -} - -VkInstance create_instance(const bool enable_validation_layers) { - std::vector enabled_instance_layers; - std::vector enabled_instance_extensions; - - if (enable_validation_layers) { - uint32_t instance_layers_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, nullptr)); - - std::vector instance_layer_properties( - instance_layers_count); - - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, - instance_layer_properties.data())); - - constexpr const char* const requested_instance_layers[]{ - // "VK_LAYER_LUNARG_api_dump", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& requested_instance_layer : requested_instance_layers) { - for (const auto& layer : instance_layer_properties) { - if (strcmp(requested_instance_layer, layer.layerName) == 0) { - enabled_instance_layers.push_back(requested_instance_layer); - break; - } - } - } - - uint32_t instance_extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, nullptr)); - - std::vector instance_extension_properties( - instance_extension_count); - - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, - &instance_extension_count, - instance_extension_properties.data())); - - constexpr const char* const requested_instance_extensions[]{ - VK_EXT_DEBUG_REPORT_EXTENSION_NAME, - }; +Context* initialize() { + static const std::unique_ptr context([]() -> Context* { + try { + const Adapter adapter = runtime()->select([](const Adapter& adapter) { + // Select the first adapter. 
+ return true; + }); - for (const auto& requested_instance_extension : requested_instance_extensions) { - for (const auto& extension : instance_extension_properties) { - if (strcmp(requested_instance_extension, extension.extensionName) == 0) { - enabled_instance_extensions.push_back(requested_instance_extension); - break; - } - } + return new Context(adapter); } - } - - constexpr VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, - nullptr, - "PyTorch", - 0, - "PyTorch", - 0, - VK_API_VERSION_1_0, - }; - - const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, - nullptr, - 0u, - &application_info, - static_cast(enabled_instance_layers.size()), - enabled_instance_layers.data(), - static_cast(enabled_instance_extensions.size()), - enabled_instance_extensions.data(), - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - TORCH_CHECK(instance, "Invalid Vulkan instance!"); - - return instance; -} - -VkDebugReportCallbackEXT create_debug_report_callback( - const VkInstance instance, - const bool enable_validation_layers) { - if (!enable_validation_layers) { - return VkDebugReportCallbackEXT{}; - } - - const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - nullptr, - VK_DEBUG_REPORT_INFORMATION_BIT_EXT | - VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, - debug_report_callback_fn, - nullptr, - }; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - - TORCH_CHECK( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - - VkDebugReportCallbackEXT debug_report_callback{}; - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance, - &debugReportCallbackCreateInfo, - nullptr, - &debug_report_callback)); - - TORCH_CHECK( - debug_report_callback, - "Invalid Vulkan debug report callback!"); - - return debug_report_callback; -} - -VkPhysicalDevice acquire_physical_device(const VkInstance instance) { - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - - TORCH_CHECK( - device_count > 0, - "Vulkan: Could not find a device with Vulkan support!"); - - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); - - return devices[0]; -} - -VkPhysicalDeviceLimits query_physical_device_physical_device_limits( - const VkPhysicalDevice physical_device) { - VkPhysicalDeviceProperties physical_device_properties{}; - vkGetPhysicalDeviceProperties(physical_device, &physical_device_properties); - return physical_device_properties.limits; -} - -uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { - uint32_t queue_family_count = 0; - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, nullptr); - - TORCH_CHECK( - queue_family_count > 0, - "Vulkan: Invalid number of queue families!"); - - std::vector queue_families_properties( - queue_family_count); - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, - &queue_family_count, - queue_families_properties.data()); - - for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { - const VkQueueFamilyProperties& properties = queue_families_properties[i]; - if 
(properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; + catch (...) { + return nullptr; } - } + }()); - TORCH_CHECK( - false, - "Vulkan: Could not find a queue family that supports compute operations!"); + return context.get(); } VkDevice create_device( const VkPhysicalDevice physical_device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + const float queue_priorities = 1.0f; const VkDeviceQueueCreateInfo device_queue_create_info{ VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, @@ -255,6 +65,10 @@ VkDevice create_device( VkQueue acquire_queue( const VkDevice device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + VkQueue queue{}; vkGetDeviceQueue(device, compute_queue_family_index, 0, &queue); TORCH_CHECK(queue, "Invalid Vulkan queue!"); @@ -264,65 +78,25 @@ VkQueue acquire_queue( } // namespace -Context::Context(const bool enable_validation_layers) - : instance_(create_instance(enable_validation_layers), &VK_DELETER(Instance)), - debug_report_callback_( - create_debug_report_callback(instance(), enable_validation_layers), - Debug(instance())), - physical_device_(acquire_physical_device(instance())), - physical_device_limits_(query_physical_device_physical_device_limits(physical_device())), - compute_queue_family_index_(query_compute_queue_family_index(physical_device())), - device_(create_device(physical_device(), compute_queue_family_index_), &VK_DELETER(Device)), - queue_(acquire_queue(device(), compute_queue_family_index_)), - command_(device()), - shader_(device()), - pipeline_(device()), - descriptor_(device()), - resource_(instance(), physical_device(), device()) { -} - -Context::Debug::Debug(const VkInstance instance) - : instance_(instance) { -} - -void Context::Debug::operator()( - const VkDebugReportCallbackEXT debug_report_callback) const { - if (debug_report_callback) { - const auto vkDestroyDebugReportCallbackEXT = - (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - - TORCH_CHECK( - vkDestroyDebugReportCallbackEXT, - "Could not load vkDestroyDebugReportCallbackEXT"); - - vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback, nullptr); - } -} - -Context* initialize() { - static const std::unique_ptr context([]() -> Context* { -#ifdef USE_VULKAN_WRAPPER - if (!InitVulkan()) { - TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan"); - return nullptr; - } -#endif - - try { - return new Context(Configuration::kEnableValidationLayers); - } - catch (...) { - return nullptr; - } - }()); - - return context.get(); +void Context::Deleter::operator()(const VkDevice device) const { + // No VK_CHECK. Don't want an exception thrown in the destructor. 
+ vkDeviceWaitIdle(device); + vkDestroyDevice(device, nullptr); } -bool available() { - return initialize(); +Context::Context(const Adapter& adapter) + : adapter_(adapter), + device_( + create_device( + adapter.handle, + adapter.compute_queue_family_index), + Deleter{}), + queue_(acquire_queue(device(), adapter.compute_queue_family_index)), + command_(gpu()), + shader_(gpu()), + pipeline_(gpu()), + descriptor_(gpu()), + resource_(gpu()) { } Context* context() { diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index 7cec6ada5d5e..5d593bdd9bc1 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -14,34 +15,29 @@ namespace api { // // Vulkan Context holds onto all relevant Vulkan state as it pertains to our -// use of Vulkan in PyTorch. The context is currently a global object, but -// technically it does not need to be if we were to make it explicit to the -// user. +// use of Vulkan in PyTorch. A Context is associated with one, and only one, +// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. +// The context is currently a global object, but technically it does not need +// to be if we were to make it explicit to the user. // class Context final { public: - explicit Context(bool enable_validation_layers); + explicit Context(const Adapter& adapter); + Context(const Context&) = delete; + Context(Context&&) = default; + Context& operator=(const Context&) = delete; + Context& operator=(Context&&) = default; ~Context() = default; - inline VkInstance instance() const { - return instance_.get(); - } - - inline VkPhysicalDevice physical_device() const { - return physical_device_; - } - - inline const VkPhysicalDeviceLimits& physical_device_limits() const { - return physical_device_limits_; - } - - inline VkDevice device() const { - return device_.get(); - } - - inline VkQueue queue() const { - return queue_; + inline GPU gpu() { + // A GPU is simply a (physical device, logical device, device queue) trio. + return { + &adapter_, + device(), + queue(), + }; } inline Command& command() { @@ -65,23 +61,26 @@ class Context final { } private: - class Debug final { - public: - explicit Debug(VkInstance instance); - void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + inline VkDevice device() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_); + return device_.get(); + } - private: - VkInstance instance_; + inline VkQueue queue() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(queue_); + return queue_; + } + + private: + class Deleter final { + public: + void operator()(VkDevice device) const; }; private: // Construction and destruction order matters. Do not move members around. 
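The warning above is not just a style note: C++ constructs non-static data
members in declaration order and destroys them in reverse; queue_ can only be
acquired once device_ exists, and the command_, shader_, pipeline_, descriptor_
and resource_ members are all built from the gpu() trio that reads adapter_,
device_ and queue_. Reordering the declarations would silently change both the
construction and the tear-down sequence.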
- Handle instance_; - Handle debug_report_callback_; - VkPhysicalDevice physical_device_; - VkPhysicalDeviceLimits physical_device_limits_; - uint32_t compute_queue_family_index_; - Handle device_; + Adapter adapter_; + Handle device_; VkQueue queue_; Command command_; Shader shader_; @@ -90,7 +89,6 @@ class Context final { Resource resource_; }; -bool available(); Context* context(); } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index bab10466ea02..ff0505ccebca 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -44,15 +44,15 @@ const Descriptor::Pool::Descriptor Descriptor::Pool::kDefault{ }, }; -Descriptor::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Descriptor::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); } typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkDescriptorPoolCreateInfo descriptor_pool_create_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index da4a2a03e2f9..bc6c14723990 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -72,7 +72,7 @@ struct Descriptor final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(DescriptorPool) Deleter; @@ -95,8 +95,8 @@ struct Descriptor final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkDescriptorPool descriptor_pool); @@ -118,9 +118,9 @@ struct Descriptor final { VkDescriptorPool descriptor_pool_; } factory; - explicit Descriptor(const VkDevice device) - : pool(device), - factory(device, pool.cache.retrieve(Pool::kDefault)) { + explicit Descriptor(const GPU& gpu) + : pool(gpu), + factory(gpu.device, pool.cache.retrieve(Pool::kDefault)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 3c845c5fae32..bd9881c05443 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -5,8 +5,8 @@ namespace native { namespace vulkan { namespace api { -Pipeline::Layout::Factory::Factory(const VkDevice device) - : device_(device) { +Pipeline::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); @@ -76,11 +76,11 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { } // namespace -Pipeline::Factory::Factory(const VkDevice device) - : device_(device), +Pipeline::Factory::Factory(const GPU& gpu) + : device_(gpu.device), pipeline_cache_( - create_pipeline_cache(device), - VK_DELETER(PipelineCache)(device)) { + create_pipeline_cache(device_), + VK_DELETER(PipelineCache)(device_)) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index 0ecef40c8b19..c327a140eded 100644 --- 
a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -49,7 +49,7 @@ struct Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(PipelineLayout) Deleter; @@ -72,8 +72,8 @@ struct Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -93,7 +93,7 @@ struct Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pipeline::Descriptor Descriptor; typedef VK_DELETER(Pipeline) Deleter; @@ -117,9 +117,9 @@ struct Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Pipeline(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Pipeline(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index 7163294bd1d9..6969883cb183 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { @@ -59,6 +60,7 @@ VmaAllocationCreateInfo create_allocation_create_info( } void release_buffer(const Resource::Buffer& buffer) { + // Safe to pass null as buffer or allocation. vmaDestroyBuffer( buffer.memory.allocator, buffer.handle, @@ -72,6 +74,7 @@ void release_image(const Resource::Image& image) { vkDestroyImageView(allocator_info.device, image.view, nullptr); } + // Safe to pass null as image or allocation. vmaDestroyImage( image.memory.allocator, image.handle, @@ -124,22 +127,20 @@ void Resource::Memory::Scope::operator()(const void* const data) const { } } -Resource::Pool::Pool( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : device_(device), +Resource::Pool::Pool(const GPU& gpu) + : device_(gpu.device), allocator_( create_allocator( - instance, - physical_device, - device), + gpu.adapter->runtime->instance(), + gpu.adapter->handle, + device_), vmaDestroyAllocator) { buffers_.reserve(Configuration::kReserve); images_.reserve(Configuration::kReserve); } -Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) { +Resource::Buffer Resource::Pool::allocate( + const Buffer::Descriptor& descriptor) { const VkBufferCreateInfo buffer_create_info{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, @@ -183,7 +184,8 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) return buffers_.back().get(); } -Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { +Resource::Image Resource::Pool::allocate( + const Image::Descriptor& descriptor) { const VkImageCreateInfo image_create_info{ VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, nullptr, diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index a74a3c2d3c89..00145ebe071f 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -108,10 +108,7 @@ struct Resource final { class Pool final { public: - Pool( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device); + explicit Pool(const GPU& gpu); Buffer allocate(const Buffer::Descriptor& descriptor); Image allocate(const Image::Descriptor& descriptor); 
@@ -128,11 +125,8 @@ struct Resource final { std::vector> images_; } pool; - Resource( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : pool(instance, physical_device, device) { + explicit Resource(const GPU& gpu) + : pool(gpu) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp new file mode 100644 index 000000000000..ce6e3b4231e4 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -0,0 +1,343 @@ +#include +#include + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { +namespace { + +struct Configuration final { +#ifndef DEBUG + static constexpr Runtime::Type kRuntime = Runtime::Type::Debug; +#else + static constexpr Runtime::Type kRuntime = Runtime::Type::Release; +#endif +}; + +VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( + const VkDebugReportFlagsEXT flags, + const VkDebugReportObjectTypeEXT /* object_type */, + const uint64_t /* object */, + const size_t /* location */, + const int32_t message_code, + const char* const layer_prefix, + const char* const message, + void* const /* user_data */) { + std::stringstream stream; + stream << layer_prefix << " " << message_code << " " << message << std::endl; + const std::string log = stream.str(); + + if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG(ERROR) << log; + } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG(WARNING) << log; + } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { + LOG(WARNING) << "Performance:" << log; + } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG(INFO) << log; + } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG(INFO) << "Debug: " << log; + } + + return VK_FALSE; +} + +VkInstance create_instance(const Runtime::Type type) { + std::vector enabled_instance_layers; + std::vector enabled_instance_extensions; + + if (Runtime::Type::Debug == type) { + uint32_t instance_layers_count = 0; + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, nullptr)); + + std::vector instance_layer_properties( + instance_layers_count); + + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, + instance_layer_properties.data())); + + constexpr const char* const requested_instance_layers[]{ + // "VK_LAYER_LUNARG_api_dump", + "VK_LAYER_KHRONOS_validation", + }; + + for (const auto& requested_instance_layer : requested_instance_layers) { + for (const auto& layer : instance_layer_properties) { + if (strcmp(requested_instance_layer, layer.layerName) == 0) { + enabled_instance_layers.push_back(requested_instance_layer); + break; + } + } + } + + uint32_t instance_extension_count = 0; + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, nullptr)); + + std::vector instance_extension_properties( + instance_extension_count); + + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, instance_extension_properties.data())); + + constexpr const char* const requested_instance_extensions[]{ + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + for (const auto& requested_instance_extension : requested_instance_extensions) { + for (const auto& extension : instance_extension_properties) { + if (strcmp(requested_instance_extension, extension.extensionName) == 0) { + enabled_instance_extensions.push_back(requested_instance_extension); + break; + } + } + } + } + + constexpr VkApplicationInfo application_info{ + VK_STRUCTURE_TYPE_APPLICATION_INFO, + 
nullptr, + "PyTorch", + 0, + "PyTorch", + 0, + VK_API_VERSION_1_0, + }; + + const VkInstanceCreateInfo instance_create_info{ + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + nullptr, + 0u, + &application_info, + static_cast(enabled_instance_layers.size()), + enabled_instance_layers.data(), + static_cast(enabled_instance_extensions.size()), + enabled_instance_extensions.data(), + }; + + VkInstance instance{}; + VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); + + return instance; +} + +VkDebugReportCallbackEXT create_debug_report_callback( + const VkInstance instance, + const Runtime::Type type) { + if (Runtime::Type::Debug != type) { + return VkDebugReportCallbackEXT{}; + } + + const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + nullptr, + VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + debug_report_callback_fn, + nullptr, + }; + + const auto vkCreateDebugReportCallbackEXT = + (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance, "vkCreateDebugReportCallbackEXT"); + + TORCH_CHECK( + vkCreateDebugReportCallbackEXT, + "Could not load vkCreateDebugReportCallbackEXT"); + + VkDebugReportCallbackEXT debug_report_callback{}; + VK_CHECK(vkCreateDebugReportCallbackEXT( + instance, + &debugReportCallbackCreateInfo, + nullptr, + &debug_report_callback)); + + TORCH_CHECK( + debug_report_callback, + "Invalid Vulkan debug report callback!"); + + return debug_report_callback; +} + +std::vector acquire_physical_devices( + const VkInstance instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + uint32_t device_count = 0; + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); + + TORCH_CHECK( + device_count > 0, + "Vulkan: Could not find a device with Vulkan support!"); + + std::vector devices(device_count); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + + return devices; +} + +VkPhysicalDeviceProperties query_physical_device_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceProperties physical_device_properties{}; + vkGetPhysicalDeviceProperties( + physical_device, + &physical_device_properties); + + return physical_device_properties; +} + +VkPhysicalDeviceMemoryProperties query_physical_device_memory_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceMemoryProperties physical_device_memory_properties{}; + vkGetPhysicalDeviceMemoryProperties( + physical_device, + &physical_device_memory_properties); + + return physical_device_memory_properties; +} + +uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, &queue_family_count, nullptr); + + TORCH_CHECK( + queue_family_count > 0, + "Vulkan: Invalid number of queue families!"); + + std::vector + queue_families_properties(queue_family_count); + + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, + 
&queue_family_count, + queue_families_properties.data()); + + for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { + const VkQueueFamilyProperties& properties = queue_families_properties[i]; + if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { + return i; + } + } + + TORCH_CHECK( + false, + "Vulkan: Could not find a queue family that supports compute operations!"); +} + +} // namespace + +Runtime::Debug::Debug(const VkInstance instance) + : instance_(instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); +} + +void Runtime::Debug::operator()( + const VkDebugReportCallbackEXT debug_report_callback) const { + if (debug_report_callback) { + const auto vkDestroyDebugReportCallbackEXT = + (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance_, "vkDestroyDebugReportCallbackEXT"); + + TORCH_CHECK( + vkDestroyDebugReportCallbackEXT, + "Could not load vkDestroyDebugReportCallbackEXT"); + + vkDestroyDebugReportCallbackEXT( + instance_, debug_report_callback, nullptr); + } +} + +Runtime::Runtime(const Type type) + : instance_(create_instance(type), &VK_DELETER(Instance)), + debug_report_callback_( + create_debug_report_callback(instance(), type), + Debug(instance())) { +} + +Adapter Runtime::select(const Selector& selector) { + const std::vector physical_devices = + acquire_physical_devices(instance()); + + for (const VkPhysicalDevice physical_device : physical_devices) { + const Adapter adapter{ + this, + physical_device, + query_physical_device_properties(physical_device), + query_physical_device_memory_properties(physical_device), + query_compute_queue_family_index(physical_device), + }; + + if (selector(adapter)) { + return adapter; + } + } + + TORCH_CHECK( + false, + "Vulkan: no adapter was selected as part of device enumeration!"); +} + +Runtime* initialize() { + static const std::unique_ptr runtime([]() -> Runtime* { +#ifdef USE_VULKAN_WRAPPER + if (!InitVulkan()) { + TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan!"); + return nullptr; + } +#endif + + try { + return new Runtime(Configuration::kRuntime); + } + catch (...) { + return nullptr; + } + }()); + + return runtime.get(); +} + +bool available() { + return initialize(); +} + +Runtime* runtime() { + Runtime* const runtime = initialize(); + TORCH_CHECK( + runtime, + "Vulkan: Backend not available on this platform!" + "Calls to api::runtime() must have been guarded by api::available()."); + + return runtime; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Runtime.h b/aten/src/ATen/native/vulkan/api/Runtime.h new file mode 100644 index 000000000000..766aeb50cabc --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.h @@ -0,0 +1,64 @@ +#pragma once + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Runtime initializes a Vulkan instance and decouples the concept of +// Vulkan instance initialization from intialization of, and subsequent +// interactions with, Vulkan [physical and logical] devices as a precursor to +// multi-GPU support. The Vulkan Runtime can be queried for available Adapters +// (i.e. physical devices) in the system which in turn can be used for creation +// of a Vulkan Context (i.e. logical devices). All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. 
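Since the Runtime now owns the VkInstance and hands out Adapters on request,
here is a hedged sketch of the intended calling pattern, including the
available() guard that the runtime() accessor insists on. The two selectors are
placeholder policies; select() throws if no enumerated adapter matches its
predicate, so a real caller would use a more forgiving policy:

  using namespace at::native::vulkan::api;

  if (available()) {
    Runtime* const vulkan_runtime = runtime();

    // Each select() call re-enumerates the physical devices behind the
    // instance and returns the first one the predicate accepts.
    const Adapter integrated = vulkan_runtime->select([](const Adapter& adapter) {
      return adapter.has_unified_memory();
    });

    const Adapter discrete = vulkan_runtime->select([](const Adapter& adapter) {
      return !adapter.has_unified_memory();
    });

    // Two independent logical devices (and queues) over the same instance,
    // which is the multi-GPU arrangement this change is preparing for.
    Context first(integrated);
    Context second(discrete);
  }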
+// + +class Runtime final { + public: + enum class Type { + Debug, + Release, + }; + + explicit Runtime(Type type); + Runtime(const Runtime&) = delete; + Runtime(Runtime&&) = default; + Runtime& operator=(const Runtime&) = delete; + Runtime& operator=(Runtime&&) = default; + ~Runtime() = default; + + inline VkInstance instance() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_); + return instance_.get(); + } + + typedef std::function Selector; + Adapter select(const Selector& selector); + + private: + class Debug final { + public: + explicit Debug(VkInstance instance); + void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + + private: + VkInstance instance_; + }; + + private: + // Construction and destruction order matters. Do not move members around. + Handle instance_; + Handle debug_report_callback_; +}; + +bool available(); +Runtime* runtime(); + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index 4cde24a2eef9..977f915a61d1 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -9,11 +9,12 @@ namespace native { namespace vulkan { namespace api { -Shader::Layout::Factory::Factory(const VkDevice device) - : device_(device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, - "Invalid Vulkan device!"); + +Shader::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( @@ -113,8 +114,8 @@ struct Shader::Factory::Compiler final { #endif /* USE_VULKAN_SHADERC_RUNTIME */ -Shader::Factory::Factory(const VkDevice device) - : device_(device), +Shader::Factory::Factory(const GPU& gpu) + : device_(gpu.device), compiler_(new Compiler) { } diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index 4a0080cb888d..ff02b2ba9064 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -52,7 +52,7 @@ struct Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(DescriptorSetLayout) Deleter; @@ -75,8 +75,8 @@ struct Shader final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -122,7 +122,7 @@ struct Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); Factory(const Factory&) = delete; Factory& operator=(const Factory&) = delete; Factory(Factory&&); @@ -152,9 +152,9 @@ struct Shader final { typedef api::Cache Cache; Cache cache; - explicit Shader(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Shader(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/api.h b/aten/src/ATen/native/vulkan/api/api.h index 394f55d7d525..658824e3bf2b 100644 --- a/aten/src/ATen/native/vulkan/api/api.h +++ b/aten/src/ATen/native/vulkan/api/api.h @@ -2,9 +2,11 @@ #include +#include #include #include #include #include #include +#include #include diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 28c1827485b7..ebf9ffce99d0 
100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -6,11 +6,6 @@ namespace { -TEST(VulkanAPITest, Context) { - constexpr bool kDebug = true; - ASSERT_NO_THROW(at::native::vulkan::api::Context{kDebug}); -} - } // namespace #endif /* USE_VULKAN_API */ From 1539d4a66478cc2288ab63f971f9074e747018c2 Mon Sep 17 00:00:00 2001 From: Haixin Liu Date: Thu, 24 Sep 2020 15:18:04 -0700 Subject: [PATCH 109/449] Add operator to compute the equalization scale (#45096) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45096 Add operator to compute the equalization scale. This will be used in the integration of equalization into dper int8 fixed quant scheme quantization flow. Design docs: https://fb.quip.com/bb7SAGBxPGNC https://fb.quip.com/PDAOAsgoLfRr Test Plan: buck test caffe2/caffe2/quantization/server:compute_equalization_scale_test Reviewed By: jspark1105 Differential Revision: D23779870 fbshipit-source-id: 5e6a8c220935a142ecf8e61100a8c71932afa8d7 --- caffe2/opt/bound_shape_inferencer.cc | 4 +- .../server/compute_equalization_scale.cc | 96 +++++++++++++++++++ .../server/compute_equalization_scale.h | 18 ++++ .../server/compute_equalization_scale_test.py | 89 +++++++++++++++++ 4 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 caffe2/quantization/server/compute_equalization_scale.cc create mode 100644 caffe2/quantization/server/compute_equalization_scale.h create mode 100644 caffe2/quantization/server/compute_equalization_scale_test.py diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index d37717d5b957..d8fe956a0ddd 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -857,7 +857,8 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { try { const static std::unordered_set types_with_independent_output_shape = {"Int8GenQuantParams", - "Int8QuantSchemeBlobFill"}; + "Int8QuantSchemeBlobFill", + "ComputeEqualizationScale"}; std::vector input_shapes; for (const auto& input : op.input()) { const auto it = shape_info_.find(input); @@ -883,6 +884,7 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { bool is_quantized = !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize") && (op.type() != "Int8QuantSchemeBlobFill") && + (op.type() != "ComputeEqualizationScale") && (op.type() != "Int8GenQuantParams"); float scale = 1; int offset = 0; diff --git a/caffe2/quantization/server/compute_equalization_scale.cc b/caffe2/quantization/server/compute_equalization_scale.cc new file mode 100644 index 000000000000..6e2f73ebd840 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.cc @@ -0,0 +1,96 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+#include "caffe2/quantization/server/compute_equalization_scale.h" +#include + +namespace caffe2 { +using namespace std; + +bool ComputeEqualizationScaleOp::RunOnDevice() { + // Generate equalization scale based on the input data (last N samples of + // the activations) and the weight + const auto& X = Input(0); + const auto& W = Input(1); + CAFFE_ENFORCE_EQ(X.dim(), 2); + CAFFE_ENFORCE_EQ(W.dim(), 2); + + const int64_t M = X.size_to_dim(1); + const int64_t N = W.size_to_dim(1); + const int64_t K = W.size_from_dim(1); + auto* S = Output(0, K, at::dtype()); + auto* S_INV = Output(1, K, at::dtype()); + const float* X_data = X.template data(); + const float* W_data = W.template data(); + float* S_data = S->template mutable_data(); + float* S_INV_data = S_INV->template mutable_data(); + + float WcolMax, XcolMax; + for (int64_t j = 0; j < K; j++) { + WcolMax = std::abs(W_data[j]); + XcolMax = std::abs(X_data[j]); + int64_t idx; + for (int64_t i = 0; i < N; i++) { + idx = i * K + j; + WcolMax = std::max(WcolMax, std::abs(W_data[idx])); + } + for (int64_t i = 0; i < M; i++) { + idx = i * K + j; + XcolMax = std::max(XcolMax, std::abs(X_data[idx])); + } + if (WcolMax == 0 || XcolMax == 0) { + S_data[j] = 1; + S_INV_data[j] = 1; + } else { + S_data[j] = std::sqrt(WcolMax / XcolMax); + S_INV_data[j] = 1 / S_data[j]; + } + } + return true; +} + +REGISTER_CPU_OPERATOR(ComputeEqualizationScale, ComputeEqualizationScaleOp); +OPERATOR_SCHEMA(ComputeEqualizationScale) + .NumInputs(2) + .NumOutputs(2) + .SetDoc(R"DOC( +Given a weight matrix W and input matrix X, the output S is the equalization parameter +vector computed from W and X, and S_INV = 1 / S + +S is computed by: +S[j] = max(abs(W[][j])) == 0 || max(abs(X[][j])) == 0 ? 1 : + sqrt(max(abs(W[][j])) / max(abs(X[][j]))), + +)DOC") + .TensorInferenceFunction([](const OperatorDef& /* def */, + const vector& in) { + vector out(2); + + if (in[0].unknown_shape() || in[1].unknown_shape()) { + out[0].set_unknown_shape(true); + out[1].set_unknown_shape(true); + return out; + } + const int64_t K = size_from_dim_(1, GetDimsVector(in[1])); + vector s_shape(2); + s_shape[0] = 1; + s_shape[1] = K; + out[0] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + out[1] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + return out; + }) + .Input( + 0, + "X", + "The input data, or last N samples of the output activations.") + .Input(1, "W", "The weight that we want to equalize with the input.") + .Output( + 0, + "S", + "Scale computed that will be multiplied to the columns of input.") + .Output( + 1, + "S_INV", + "Scale inverse that will be multiplied to the columns of weight.") + .SetDoc( + R"DOC(Operator to compute equalization scale given the input data and weight)DOC"); + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale.h b/caffe2/quantization/server/compute_equalization_scale.h new file mode 100644 index 000000000000..a9facf8e1206 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.h @@ -0,0 +1,18 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+ +#pragma once +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/dnnlowp.h" + +namespace caffe2 { + +class ComputeEqualizationScaleOp final : public Operator { + public: + ComputeEqualizationScaleOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override; + +}; // class ComputeEqualizationScaleOp + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale_test.py b/caffe2/quantization/server/compute_equalization_scale_test.py new file mode 100644 index 000000000000..74d34c5502d3 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale_test.py @@ -0,0 +1,89 @@ +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import, division, print_function, unicode_literals + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np +from caffe2.python import core, workspace +from hypothesis import given, settings + + +class TestComputeEqualizationScaleOp(hu.HypothesisTestCase): + @settings(max_examples=10) + @given( + m=st.integers(1, 50), + n=st.integers(1, 50), + k=st.integers(1, 50), + rnd_seed=st.integers(1, 5), + **hu.gcs_cpu_only + ) + def test_compute_equalization_scale(self, m, n, k, rnd_seed, gc, dc): + np.random.seed(rnd_seed) + W = np.random.rand(n, k).astype(np.float32) - 0.5 + X = np.random.rand(m, k).astype(np.float32) - 0.5 + + def ref_compute_equalization_scale(X, W): + S = np.ones([X.shape[1]]) + S_INV = np.ones([X.shape[1]]) + for j in range(W.shape[1]): + WcolMax = np.absolute(W[:, j]).max() + XcolMax = np.absolute(X[:, j]).max() + if WcolMax and XcolMax: + S[j] = np.sqrt(WcolMax / XcolMax) + S_INV[j] = 1 / S[j] + return S, S_INV + + net = core.Net("test") + + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + net.Proto().op.extend([ComputeEqualizationScaleOp]) + + self.ws.create_blob("X").feed(X, device_option=gc) + self.ws.create_blob("W").feed(W, device_option=gc) + self.ws.run(net) + + S = self.ws.blobs["S"].fetch() + S_INV = self.ws.blobs["S_INV"].fetch() + S_ref, S_INV_ref = ref_compute_equalization_scale(X, W) + np.testing.assert_allclose(S, S_ref, atol=1e-3, rtol=1e-3) + np.testing.assert_allclose(S_INV, S_INV_ref, atol=1e-3, rtol=1e-3) + + def test_compute_equalization_scale_shape_inference(self): + X = np.array([[1, 2], [2, 4], [6, 7]]).astype(np.float32) + W = np.array([[2, 3], [5, 4], [8, 2]]).astype(np.float32) + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + workspace.FeedBlob("X", X) + workspace.FeedBlob("W", W) + + net = core.Net("test_shape_inference") + net.Proto().op.extend([ComputeEqualizationScaleOp]) + shapes, types = workspace.InferShapesAndTypes( + [net], + blob_dimensions={"X": 
X.shape, "W": W.shape}, + blob_types={"X": core.DataType.FLOAT, "W": core.DataType.FLOAT}, + ) + assert ( + "S" in shapes and "S" in types and "S_INV" in shapes and "S_INV" in types + ), "Failed to infer the shape or type of output" + self.assertEqual(shapes["S"], [1, 2]) + self.assertEqual(shapes["S_INV"], [1, 2]) + self.assertEqual(types["S"], core.DataType.FLOAT) + self.assertEqual(types["S_INV"], core.DataType.FLOAT) From 0137e3641d61983dbd22b2b2f9cc8c002e86aab4 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 24 Sep 2020 15:22:16 -0700 Subject: [PATCH 110/449] Refactor subgraph merging (#44238) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44238 Refactor create_autodiff_subgraphs to use the same updating of output aliasing properties logic as tensorexpr fuser, and factor that out to a common function in subgraph utils. Test Plan: Imported from OSS Reviewed By: Krovatkin, robieta Differential Revision: D23871565 Pulled By: eellison fbshipit-source-id: 72df253b16baf8e4aabf3d68b103b29e6a54d44c --- .../jit/passes/create_autodiff_subgraphs.cpp | 73 +------------- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 63 +----------- .../csrc/jit/passes/utils/subgraph_utils.cpp | 97 +++++++++++++++++++ torch/csrc/jit/passes/utils/subgraph_utils.h | 16 +++ 4 files changed, 121 insertions(+), 128 deletions(-) diff --git a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp index 11bee519292c..6ac510b13777 100644 --- a/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp +++ b/torch/csrc/jit/passes/create_autodiff_subgraphs.cpp @@ -13,66 +13,6 @@ namespace jit { namespace { -std::vector> gatherLastUses( - at::ArrayRef values) { - return fmap(values, [&](Value* v) -> c10::optional { - return firstOrLastUse(v, /*find_first*/ false); - }); -} - -// When merging a node into a subgraph, we wish to preserve all of the -// aliasing properties of the node's outputs. It is difficult to track -// the node or its contained nodes through all of the ir manipulation -// involved in merging; it is pretty easy to uniquely identify the value -// based on its uses. We can identify the value by its last use in the graph. -// Values which do not have uses or which do not have a last use -// outside of the subgraph to be merged into we do not need to track. 
-struct ValueMapper { - ValueMapper(Node* n, AliasDb& db, size_t subgraph_num_outputs) { - last_uses_ = gatherLastUses(n->outputs()); - subgraph_num_outputs_ = subgraph_num_outputs; - WithInsertPoint guard(n); - auto g = n->owningGraph(); - // temporary node to put the aliasing properties of the node before its - // merged and destroyed - placeholder_node_ = g->insertNode(g->create(prim::Uninitialized, 0)); - for (size_t i = 0; i < n->outputs().size(); ++i) { - Value* existing = n->outputs().at(i); - Value* new_value = - placeholder_node_->insertOutput(i)->copyMetadata(n->outputs().at(i)); - db.replaceWithNewValue(existing, new_value); - } - } - - bool usesEqual(const Use& a, const Use& b) { - return a.user == b.user && a.offset == b.offset; - } - - void copyAliasing(Node* merged_node, AliasDb& db) { - auto num_outputs = merged_node->outputs().size(); - auto new_outputs = merged_node->outputs().slice( - subgraph_num_outputs_, num_outputs - subgraph_num_outputs_); - for (Value* v : new_outputs) { - auto maybe_last_use = firstOrLastUse(v, /*find_first*/ false); - // if it doesnt have a use it shouldnt have been added as output - TORCH_INTERNAL_ASSERT(maybe_last_use); - const Use last_use = *maybe_last_use; - size_t i = 0; - while (i < last_uses_.size() && last_uses_.at(i).has_value() && - !usesEqual(*last_uses_.at(i), last_use)) { - ++i; - } - TORCH_INTERNAL_ASSERT(i != last_uses_.size()); - db.replaceWithNewValue(placeholder_node_->outputs().at(i), v); - } - placeholder_node_->destroy(); - } - - std::vector> last_uses_; - size_t subgraph_num_outputs_; - Node* placeholder_node_; -}; - struct WorkBlock : public std::pair { using pair::pair; @@ -285,11 +225,8 @@ class SubgraphSlicer { std::pair scanNode(Node* consumer) { if (shouldConsiderForMerge(consumer)) { if (consumer->kind() != prim::DifferentiableGraph) { - // ValueMapper preserves the aliasing information of the node's outputs - ValueMapper vm(consumer, aliasDb_, 0); - consumer = SubgraphUtils::createSingletonSubgraph( - consumer, prim::DifferentiableGraph); - vm.copyAliasing(consumer, aliasDb_); + consumer = SubgraphUtils::createSingletonSubgraphAndUpdateAliasing( + consumer, prim::DifferentiableGraph, aliasDb_); } auto inputs = sortReverseTopological(consumer->inputs()); for (auto input : inputs) { @@ -315,10 +252,8 @@ class SubgraphSlicer { return c10::nullopt; } - ValueMapper vm(producer, aliasDb_, consumer->outputs().size()); - SubgraphUtils::mergeNodeIntoSubgraph(producer, consumer); - vm.copyAliasing(consumer, aliasDb_); - + SubgraphUtils::mergeNodeIntoSubgraphAndUpdateAliasing( + producer, consumer, aliasDb_); return consumer; } diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 4d98110d3975..67a232d94088 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -305,69 +305,13 @@ class TensorExprFuser { } private: - // Merges `to_merge` into a subgraph by executing merge_fn. - // merge_fn takes in map that will be filled with the mapping b/w - // to_merge's outputs and the corresponding values in the subgraph. - // merge_fn returns the merged-into subgraph - Node* aliasingSafeSubgraphMerge( - Node* to_merge, - const std::function&)>& - merge_fn) { - // When we merge a node into a subgraph, the new subgraph outputs - // have the same aliasing properties as the original node's outputs. 
- // Here we create a placeholder node, transfer the aliasing properties - // to the placeholder, execute the merge, and transfer the aliasing - // properties to the appropriate fusion group outputs - Node* placeholder_node = - graph_->insertNode(graph_->create(prim::Uninitialized, 0)); - std::vector existing_values; - for (size_t i = 0; i < to_merge->outputs().size(); ++i) { - Value* existing = to_merge->outputs().at(i); - Value* new_value = placeholder_node->insertOutput(i)->copyMetadata( - to_merge->outputs().at(i)); - aliasDb_->replaceWithNewValue(existing, new_value); - existing_values.push_back(existing); - } - std::unordered_map vmap; - Node* fusion_group = merge_fn(vmap); - for (size_t i = 0; i < existing_values.size(); ++i) { - TORCH_INTERNAL_ASSERT(vmap.count(existing_values.at(i))); - Value* subgraph_value = vmap[existing_values.at(i)]; - auto subgraph = SubgraphUtils::getSubgraph(fusion_group); - size_t subgraph_output_index = 0; - for (; subgraph_output_index < subgraph->outputs().size(); - ++subgraph_output_index) { - if (subgraph->outputs().at(subgraph_output_index) == subgraph_value) { - break; - } - } - if (subgraph_output_index != subgraph->outputs().size()) { - aliasDb_->replaceWithNewValue( - placeholder_node->outputs().at(i), - fusion_group->outputs().at(subgraph_output_index)); - } - } - placeholder_node->destroy(); - return fusion_group; - } - Node* getOrCreateTensorExprSubgraph(Node* n) { if (n->hasAttribute(attr::Subgraph) && n->kind() == prim::TensorExprGroup) { return n; } GRAPH_UPDATE("Creating a tensorexpr::Group node from: ", *n); - return aliasingSafeSubgraphMerge( - n, [&](std::unordered_map& vmap) { - return SubgraphUtils::createSingletonSubgraph( - n, prim::TensorExprGroup, vmap); - }); - } - - void mergeNodeIntoSubgraphAndUpdateAliasing(Node* n, Node* subgraph) { - aliasingSafeSubgraphMerge(n, [&](std::unordered_map& vmap) { - SubgraphUtils::mergeNodeIntoSubgraph(n, subgraph, vmap); - return subgraph; - }); + return SubgraphUtils::createSingletonSubgraphAndUpdateAliasing( + n, prim::TensorExprGroup, *aliasDb_); } // Add unvisited input nodes to the queue for further merging into the fusion @@ -557,7 +501,8 @@ class TensorExprFuser { for (auto n : nodes_to_merge) { GRAPH_UPDATE("Merging ", getHeader(n)); - mergeNodeIntoSubgraphAndUpdateAliasing(n, fusion_group); + SubgraphUtils::mergeNodeIntoSubgraphAndUpdateAliasing( + n, fusion_group, *aliasDb_); } return fusion_group; } diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 6c9aad77cf93..1576aca36fa8 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -1,4 +1,5 @@ #include +#include namespace torch { namespace jit { @@ -9,6 +10,82 @@ bool hasSubgraph(Node* n) { return n->hasAttribute(attr::Subgraph); } +std::vector> gatherLastUses( + at::ArrayRef values) { + return fmap(values, [&](Value* v) -> c10::optional { + return firstOrLastUse(v, /*find_first*/ false); + }); +} + +// When merging a node into a subgraph, we wish to preserve all of the +// aliasing properties of the node's outputs. It is difficult to track +// the node or its contained nodes through all of the ir manipulation +// involved in merging; it is pretty easy to uniquely identify the value +// based on its uses. We can identify the value by its last use in the graph. +// Values which do not have uses or which do not have a last use +// outside of the subgraph to be merged into we do not need to track. 
+struct ValueMapper { + ValueMapper(Node* to_merge, AliasDb& db, size_t subgraph_num_outputs) { + last_uses_ = gatherLastUses(to_merge->outputs()); + subgraph_num_outputs_ = subgraph_num_outputs; + WithInsertPoint guard(to_merge); + auto g = to_merge->owningGraph(); + // temporary node to put the aliasing properties of the node before its + // merged and destroyed + placeholder_node_ = g->insertNode(g->create(prim::Uninitialized, 0)); + for (size_t i = 0; i < to_merge->outputs().size(); ++i) { + Value* existing = to_merge->outputs().at(i); + Value* new_value = placeholder_node_->insertOutput(i)->copyMetadata( + to_merge->outputs().at(i)); + db.replaceWithNewValue(existing, new_value); + } + } + + bool usesEqual(const Use& a, const Use& b) { + return a.user == b.user && a.offset == b.offset; + } + + void copyAliasing(Node* merged_node, AliasDb& db) { + auto num_outputs = merged_node->outputs().size(); + auto new_outputs = merged_node->outputs().slice( + subgraph_num_outputs_, num_outputs - subgraph_num_outputs_); + for (Value* v : new_outputs) { + auto maybe_last_use = firstOrLastUse(v, /*find_first*/ false); + // if it doesnt have a use it shouldnt have been added as output + TORCH_INTERNAL_ASSERT(maybe_last_use); + const Use last_use = *maybe_last_use; + size_t i = 0; + while (i < last_uses_.size() && last_uses_.at(i).has_value() && + !usesEqual(*last_uses_.at(i), last_use)) { + ++i; + } + TORCH_INTERNAL_ASSERT(i != last_uses_.size()); + db.replaceWithNewValue(placeholder_node_->outputs().at(i), v); + } + placeholder_node_->destroy(); + } + + std::vector> last_uses_; + size_t subgraph_num_outputs_; + Node* placeholder_node_; +}; + +Node* executeSubgraphMergeAndUpdateAliasing( + Node* to_merge, + c10::optional existing, + AliasDb& db, + const std::function& merge_fn) { + // When we merge a node into a subgraph, the new subgraph outputs + // have the same aliasing properties as the original node's outputs. + // Here we create a placeholder node, transfer the aliasing properties + // to the placeholder, execute the merge, and transfer the aliasing + // properties to the appropriate fusion group outputs + ValueMapper vm(to_merge, db, existing ? (*existing)->outputs().size() : 0); + Node* fusion_group = merge_fn(); + vm.copyAliasing(fusion_group, db); + return fusion_group; +} + // Combine the nodes in two subgraph together. The nodes will end up in // `mergeTo`, and `mergeFrom` is destroyed. 
 void mergeSubgraph(
@@ -281,6 +358,26 @@ Node* createSingletonSubgraph(Node* n, Symbol subgraphKind) {
   return createSingletonSubgraph(n, subgraphKind, vmap);
 }
 
+void mergeNodeIntoSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Node* subgraphNode,
+    AliasDb& db) {
+  executeSubgraphMergeAndUpdateAliasing(to_merge, subgraphNode, db, [&]() {
+    mergeNodeIntoSubgraph(to_merge, subgraphNode);
+    return subgraphNode;
+  });
+}
+
+Node* createSingletonSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Symbol subgraphKind,
+    AliasDb& db) {
+  return executeSubgraphMergeAndUpdateAliasing(
+      to_merge, c10::nullopt, db, [&]() {
+        return createSingletonSubgraph(to_merge, subgraphKind);
+      });
+}
+
 } // namespace SubgraphUtils
 } // namespace jit
 } // namespace torch
diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.h b/torch/csrc/jit/passes/utils/subgraph_utils.h
index 94150258b5fa..77c3d388425f 100644
--- a/torch/csrc/jit/passes/utils/subgraph_utils.h
+++ b/torch/csrc/jit/passes/utils/subgraph_utils.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include 
+#include 
 #include 
 
 namespace torch {
@@ -26,6 +27,13 @@ TORCH_API Node* createSingletonSubgraph(
     Symbol subgraphKind,
     std::unordered_map& vmap);
 
+// Creates a new subgraph that only contains `n`, and updates the new outputs
+// of the subgraph to have the aliasing properties of the original `n` outputs
+TORCH_API Node* createSingletonSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Symbol subgraphKind,
+    AliasDb& db);
+
 // Merge a node into a subgraph node. If `toMerge` is also a subgraph, the
 // subgraphs are merged.
 // `toMerge` is destroyed.
@@ -37,6 +45,14 @@ TORCH_API void mergeNodeIntoSubgraph(
     Node* subgraphNode,
     std::unordered_map& vmap);
 
+// Merges a node into a subgraph node, and updates the new outputs of the
+// subgraph to have the aliasing properties of the corresponding `to_merge`
+// outputs
+TORCH_API void mergeNodeIntoSubgraphAndUpdateAliasing(
+    Node* to_merge,
+    Node* subgraphNode,
+    AliasDb& db);
+
 // Move nodes from a subgraph node to the outer graph.
 // `subgraphNode` is destroyed.
 // An optional argument 'vmap' could be used to retrieve value mappings.

From 5dd288eb066bc178a89447453c7fba961a3e0174 Mon Sep 17 00:00:00 2001
From: Elias Ellison
Date: Thu, 24 Sep 2020 15:22:16 -0700
Subject: [PATCH 111/449] [JIT] Regularize tensorexpr fuser strategy with other fusers (#44972)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44972

Previously, our fusion strategy would be:
- start at the end of the block, find a fusable node
- iteratively try to merge inputs into the fusion group, sorted topologically

This strategy works pretty well, but has the possibility of missing fusion groups. See my attached test case for an example where we wouldn't find all possible fusion groups. bertmaher found an example of a missed fusion group in one of our rnn examples (jit_premul) that caused a regression from the legacy fuser.

Here, I'm updating our fusion strategy to be the same as our other fusion passes - create_autodiff_subgraphs, and graph_fuser.cpp.
The basic strategy is:
- iterate until you find a fusible node
- try to merge the node's inputs; whenever a successful merge occurs, restart at the beginning of the node's inputs
- after you've exhausted a node, continue searching the block for fusion opportunities from the node
- continue doing this on the block until we go through an iteration without any successful merges

Since we create the fusion groups once, and only re-specialize within the fusion groups, we should be running this very infrequently (only re-triggers when we fail undefinedness specializations). Also, because it's the same algorithm as the existing fuser, it is unlikely to cause a regression.

Test Plan: Imported from OSS

Reviewed By: Krovatkin, robieta

Differential Revision: D23821581

Pulled By: eellison

fbshipit-source-id: e513d1ef719120dadb0bfafc7a14f4254cd806ee
---
 test/jit/test_profiler.py                    |  22 +++
 torch/csrc/jit/passes/tensorexpr_fuser.cpp   | 184 +++++++++---------
 .../csrc/jit/passes/utils/subgraph_utils.cpp |  28 ++-
 torch/csrc/jit/passes/utils/subgraph_utils.h |  10 +-
 4 files changed, 146 insertions(+), 98 deletions(-)

diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py
index 50d4351a4870..55604f5ff6bf 100644
--- a/test/jit/test_profiler.py
+++ b/test/jit/test_profiler.py
@@ -83,6 +83,7 @@ def test_fuse(a, b):
         # that guards a tensorexpr group
         optimized_block = next(g.findNode("prim::If").blocks())
         if_nodes = list(optimized_block.findAllNodes("prim::If"))
+        self.assertEqual(len(if_nodes), 1)
         FileCheck().check("Group[Subgraph").run(str(if_nodes[0]))
 
         # no broadcasts occurred, sum_to_size have been specialized out
@@ -191,3 +192,24 @@ def foo(a, b):
 
         g = torch.jit.last_executed_optimized_graph()
         FileCheck().check("fallback_function").check_next("CallFunction").run(g)
+
+    def test_iterative_fusion(self):
+        @torch.jit.script
+        def foo(a, b, c, d):
+            a = a + b
+            b.add_(3)
+            c = c + b + d
+            a = a + 1
+            return a, c
+
+        x = torch.ones(1, requires_grad=False)
+        foo(x, x, x, x)
+        foo(x, x, x, x)
+
+        # when we iterate through the block, we will start
+        # by fusing a = a + b with a = a + 1
+        # if we were to continue iteration from that fusion point,
+        # would miss the fusion opportunity of c = c + d + b
+
+        g = torch.jit.last_executed_optimized_graph()
+        self.assertEqual(len(list(g.findAllNodes("prim::TensorExprGroup"))), 2)
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 4d98110d3975..3782c2af4f33 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -166,6 +166,7 @@ bool isSupported(Node* node) {
   switch (node->kind()) {
     case prim::ConstantChunk:
     case prim::ListConstruct:
+    case prim::TensorExprGroup:
       return true;
   }
 
@@ -201,12 +202,6 @@ bool texprReductionsEnabled() {
   return texpr_reductions_enabled;
 }
 
-struct nodesComparator {
-  bool operator()(Node* a, Node* b) const {
-    return a->isAfter(b);
-  }
-};
-
 // TODO: if a value has differently typed uses, temporarrily insert a node
 // specializing the type for each use and later remove, instead of bailing
 bool profiledWithDifferentTypes(Value* v) {
@@ -298,6 +293,11 @@ class TensorExprFuser {
     GRAPH_DUMP("After removing redundant profile nodes: ", graph_);
     createFusionGroups(graph_->block());
     GRAPH_DUMP("After creating fusion groups: ", graph_);
+    // we maintain alias db correctness during initial fusion, but it is
+    // difficult to maintain correctness after inlining so inline only after
+    // fusion is done.
+ inlineSmallFusionGroups(graph_->block()); + GRAPH_DUMP("After inlining small fusion groups: ", graph_); guardFusionGroups(graph_->block()); GRAPH_DUMP("After guarding fusion groups: ", graph_); removeTensorTypeSpecializations(graph_->block()); @@ -314,56 +314,44 @@ class TensorExprFuser { n, prim::TensorExprGroup, *aliasDb_); } - // Add unvisited input nodes to the queue for further merging into the fusion - // group. - void updateQueue( - Node* fusion_group, - std::set& queue, - const std::unordered_set& visited) { - for (auto input : fusion_group->inputs()) { - if (!visited.count(input->node())) { - queue.insert(input->node()); + value_list sortReverseTopological(ArrayRef inputs, Block* b) { + value_list result; + for (auto i : inputs) { + if (i->node()->owningBlock() == b) { + result.push_back(i); } } + // Sort in reverse topological order + std::sort(result.begin(), result.end(), [&](Value* a, Value* b) { + return a->node()->isAfter(b->node()); + }); + return result; } // Create a fusion group starting from the node N. // We then try to pull inputs into the fusion group and repeat that process // until there is nothing we can pull in. - Node* createFusionGroup(Node* n) { - // Queue of the nodes we should consider for merging into the fusion groups - // (those nodes are usually inputs of the fusion group). - // We use an ordered set here to visit them in the right order: the fusion - // group is closer to the end of the block and we are trying to pull later - // nodes first. - // NB: the order in the list in theory could stale if we move nodes around. - // However, this should only happen to the nodes we could not fuse, and - // hence it should not be a problem. - std::set queue; - std::unordered_set visited_nodes; - - Node* fusion_group = n; + std::pair createFusionGroup( + Node* fusion_node) { if (min_group_size_ == 1) { - fusion_group = getOrCreateTensorExprSubgraph(n); + fusion_node = getOrCreateTensorExprSubgraph(fusion_node); } - updateQueue(fusion_group, queue, visited_nodes); - GRAPH_DEBUG("Iteratively pull input nodes into the fusion group...\n"); - while (!queue.empty()) { - debugDumpFusionGroup("Current fusion group: ", fusion_group); - GRAPH_DEBUG(queue.size(), " nodes are in the queue.\n"); - - Node* input_node = *queue.begin(); - queue.erase(queue.begin()); - - GRAPH_DEBUG("Trying to merge: ", *input_node); - fusion_group = tryMerge(fusion_group, input_node); - visited_nodes.insert(input_node); - updateQueue(fusion_group, queue, visited_nodes); + auto inputs = sortReverseTopological( + fusion_node->inputs(), fusion_node->owningBlock()); + for (auto input : inputs) { + debugDumpFusionGroup("Current fusion group: ", fusion_node); + GRAPH_DEBUG("Trying to merge: ", *input->node()); + if (auto maybe_fusion_group = tryMerge(fusion_node, input->node())) { + // we successfully merged, so the new group's `inputs` may have + // changed. So rescan the new group for more merging opportunities. + return std::make_pair( + maybe_fusion_group.value()->reverseIterator(), true); + } } - return fusion_group; + return std::make_pair(++fusion_node->reverseIterator(), false); } static void debugDumpFusionGroup(const std::string& msg, Node* n) { @@ -373,69 +361,75 @@ class TensorExprFuser { } } + std::pair scanNode(Node* n) { + GRAPH_DEBUG("Considering node:", *n) + + if (!canHandle(n)) { + return std::make_pair(++n->reverseIterator(), false); + } + // There are some nodes that we can support, but we don't want to start a + // fusion group from - skip them. 
+ if (n->kind() == prim::ListConstruct || n->kind() == aten::slice || + n->kind() == aten::unsqueeze || n->kind() == prim::ConstantChunk || + n->kind() == prim::Constant) { + return std::make_pair(++n->reverseIterator(), false); + } + return createFusionGroup(n); + } + // Merge fusible nodes into subgraphs in prim::TensorExprGroup nodes. void createFusionGroups(Block* block) { - std::vector fusion_groups; - auto reverse_iter = block->nodes().reverse(); - Node* prev_fusion_group = nullptr; - for (auto it = reverse_iter.begin(); it != reverse_iter.end();) { - Node* n = *it; - GRAPH_DEBUG("Considering node:", *n) + bool any_changed = true; + while (any_changed) { + any_changed = false; + for (auto it = block->nodes().rbegin(); it != block->nodes().rend();) { + bool changed; + std::tie(it, changed) = scanNode(*it); + any_changed |= changed; + } + } + for (Node* n : block->nodes()) { for (Block* b : n->blocks()) { createFusionGroups(b); } + } - if (!canHandle(n)) { - it++; - continue; - } - // There are some nodes that we can support, but we don't want to start a - // fusion group from - skip them. - if (n->kind() == prim::ListConstruct || n->kind() == aten::slice || - n->kind() == aten::unsqueeze || n->kind() == prim::ConstantChunk || - n->kind() == prim::Constant) { - it++; - continue; + // Try to merge adjacent fusion groups together. Because we have only merged + // by looking at graph inputs, without this we would not attempt to merge + // adjacent fusion groups that don't have a depdency on each other + + std::vector initial_fusion_groups; + for (Node* n : block->nodes()) { + if (n->kind() == prim::TensorExprGroup) { + initial_fusion_groups.push_back(n); } + } - Node* fusion_group = createFusionGroup(n); - debugDumpFusionGroup("Fusion group constructed: ", fusion_group); + Node* prev_fusion_group = + initial_fusion_groups.size() ? initial_fusion_groups[0] : nullptr; + for (size_t i = 1; i < initial_fusion_groups.size(); ++i) { // Try merging the just created fusion group into the previous one. // If it did not work, then put the previous fusion group into // fusion_groups vector - we will not touch it anymore in this loop. // If merging suceeded, save the merged group as the "previous" fusion // group so that we can try to merge the next one into it. 
- if (prev_fusion_group) { + + Node* fusion_group = initial_fusion_groups[i]; + debugDumpFusionGroup( + "Trying to merge into the previous fusion group: ", + prev_fusion_group); + if (auto merged_fusion_group = + tryMerge(prev_fusion_group, fusion_group)) { + prev_fusion_group = *merged_fusion_group; debugDumpFusionGroup( - "Trying to merge into the previous fusion group: ", + "Successfully merged into the previous fusion group: ", prev_fusion_group); - if (canMerge(prev_fusion_group, fusion_group)) { - prev_fusion_group = tryMerge(prev_fusion_group, fusion_group); - debugDumpFusionGroup( - "Successfully merged into the previous fusion group: ", - prev_fusion_group); - } else { - GRAPH_DEBUG("Cannot merge into the previous fusion group"); - fusion_groups.push_back(prev_fusion_group); - prev_fusion_group = fusion_group; - } } else { + GRAPH_DEBUG("Cannot merge into the previous fusion group"); prev_fusion_group = fusion_group; } - it = prev_fusion_group->reverseIterator(); - it++; - } - - // We were adding groups into the vector lagging by one - catch up with - // adding the last one - if (prev_fusion_group) { - fusion_groups.push_back(prev_fusion_group); - } - - for (Node* n : fusion_groups) { - inlineIfTooSmall(n); } } @@ -471,9 +465,21 @@ class TensorExprFuser { return false; } - Node* tryMerge(Node* fusion_group, Node* to_merge) { + void inlineSmallFusionGroups(Block* block) { + for (auto it = block->nodes().begin(); it != block->nodes().end();) { + Node* n = *it; + it++; + + for (Block* b : n->blocks()) { + inlineSmallFusionGroups(b); + } + inlineIfTooSmall(n); + } + } + + c10::optional tryMerge(Node* fusion_group, Node* to_merge) { if (!canMerge(fusion_group, to_merge)) { - return fusion_group; + return c10::nullopt; } std::vector nodes_to_merge = {to_merge}; @@ -490,7 +496,7 @@ class TensorExprFuser { GRAPH_UPDATE("Trying to move node next to fusion group: ", getHeader(n)); if (!aliasDb_->moveBeforeTopologicallyValid(n, move_point)) { GRAPH_UPDATE("Failed to move because of AliasDB checks!"); - return fusion_group; + return c10::nullopt; } move_point = n; } diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index 1576aca36fa8..73976cb66bc8 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -117,11 +117,20 @@ void mergeSubgraph( // Now we're merging the "unmerged" nodes into the mergeFrom subgraph. That // will give us a new map: "unmerged" -> "merged". 
std::unordered_map merge_vmap; + + // defer destroying nodes until after all nodes have been merged, otherwise we + // run into lifetime issues where the previous mapping of the merged nodes + // inputs/outputs can be overwritten with newly created values + std::vector merged_nodes; while (it != end_it) { - // NB: mergeNodeIntoSubgraph destroys node, hence the complications Node* node = *it; ++it; - mergeNodeIntoSubgraph(node, mergeTo, merge_vmap); + merged_nodes.push_back(node); + mergeNodeIntoSubgraph(node, mergeTo, merge_vmap, /*destroyNode*/ false); + } + + for (Node* n : merged_nodes) { + n->destroy(); } // Vmap should contain "original" -> "merged" mapping, thus we basically need @@ -228,7 +237,8 @@ std::unordered_set closedOverValues( void mergeNodeIntoSubgraph( Node* toMerge, Node* subgraphNode, - std::unordered_map& vmap) { + std::unordered_map& vmap, + bool destroyNode) { AT_ASSERT(hasSubgraph(subgraphNode) && toMerge != subgraphNode); if (hasSubgraph(toMerge)) { return mergeSubgraph(subgraphNode, toMerge, vmap); @@ -334,11 +344,17 @@ void mergeNodeIntoSubgraph( } } // Remove the original node now that the merge is complete - toMerge->destroy(); + if (destroyNode) { + toMerge->destroy(); + } } -void mergeNodeIntoSubgraph(Node* toMerge, Node* subgraphNode) { + +void mergeNodeIntoSubgraph( + Node* toMerge, + Node* subgraphNode, + bool destroyNode) { std::unordered_map vmap; - mergeNodeIntoSubgraph(toMerge, subgraphNode, vmap); + mergeNodeIntoSubgraph(toMerge, subgraphNode, vmap, destroyNode); } Node* createSingletonSubgraph( diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.h b/torch/csrc/jit/passes/utils/subgraph_utils.h index 77c3d388425f..c0ffc3635031 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.h +++ b/torch/csrc/jit/passes/utils/subgraph_utils.h @@ -36,14 +36,18 @@ TORCH_API Node* createSingletonSubgraphAndUpdateAliasing( // Merge a node into a subgraph node. If `toMerge` is also a subgraph, the // subgraphs are merged. -// `toMerge` is destroyed. +// If `destroyNode` is true `toMerge` is destroyed. // An optional argument 'vmap' could be used to retrieve value mappings. // Values will be mapped to their new subgraph values -TORCH_API void mergeNodeIntoSubgraph(Node* toMerge, Node* subgraphNode); TORCH_API void mergeNodeIntoSubgraph( Node* toMerge, Node* subgraphNode, - std::unordered_map& vmap); + bool destroyNode = true); +TORCH_API void mergeNodeIntoSubgraph( + Node* toMerge, + Node* subgraphNode, + std::unordered_map& vmap, + bool destroyNode = true); // Merges a node into a subgraph node, and updates the new outputs of the // subgraph to have the aliasing properties of the corresponding `to_merge` From bee1d448e76837e7ffc066fcad576ccb98e92ee1 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 24 Sep 2020 15:55:35 -0700 Subject: [PATCH 112/449] Fix test_rpc_profiling_remote_record_function (#45162) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45162 This test was flaky because it was not able to validate that the overall record_function's CPU times are greater than the sum of its children. It turns out that this is a general bug in the profiler that can be reproduced without RPC, see https://github.com/pytorch/pytorch/issues/45160. Hence, removing this from the test and replacing it by just validating the expected children. Ran the test 1000 times and they all passed. 
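For illustration, a minimal standalone sketch of the recursive child collection this change relies on (hypothetical helper names; it assumes profiler events expose `name` and `cpu_children` as in torch.autograd.profiler, mirroring the test code below):

```python
REMOTE_OP_STR = "#remote_op: "

def get_all_cpu_children(event):
    # Sketch only: gather all transitive cpu children of a profiler event.
    # `event` is assumed to expose `cpu_children` (direct children only).
    children = []
    for child in event.cpu_children:
        children.append(child)
        children.extend(get_all_cpu_children(child))
    return children

def to_local_name(remote_event_name):
    # Strip the "#remote_op: " prefix so remote event names compare
    # against local operator names.
    return remote_event_name.split(REMOTE_OP_STR)[-1]

def check_is_child(remote_event, record_function_event):
    # Validate that the remote op shows up somewhere under the
    # record_function event, rather than comparing CPU times.
    child_names = [to_local_name(c.name)
                   for c in get_all_cpu_children(record_function_event)]
    assert to_local_name(remote_event.name) in child_names
```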
ghstack-source-id: 112632327 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23851854 fbshipit-source-id: 5d9023acd17800a6668ba4849659d8cc902b8d6c --- .../_internal/distributed/rpc/rpc_test.py | 48 ++++++++++++++----- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 797e5a010b86..163a772628a5 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1302,19 +1302,41 @@ def test_rpc_profiling_remote_record_function(self): "aten::zero_", "aten::fill_", ] - remote_ops_time = sum( - evt.cpu_time_total - for evt in remaining_remote_events - if not any( - [ - rf_entry_event in evt.name - for rf_entry_event in remote_events_denylist - ] - ) - ) - self.assertGreaterEqual( - record_function_remote_event.cpu_time_total, remote_ops_time - ) + + REMOTE_OP_STR = "#remote_op: " + + def convert_remote_to_local(event_name): + remote_op_key = REMOTE_OP_STR + return event_name[event_name.find(remote_op_key) + len(remote_op_key) :] + + # Ideally, we should validate that the sum of remote operations within + # record_function are less than record_function's CPU time. However, + # there is a known bug in profiling + # (https://github.com/pytorch/pytorch/issues/45160) due to which we + # can't do this. So, we just validate they are child events. + prof.key_averages() + + # cpu_children only returns direct children, so here we get all + # children recursively. + def get_cpu_children(event): + if not event.cpu_children: + return [] + cpu_children = event.cpu_children + for e in event.cpu_children: + cpu_children.extend(get_cpu_children(e)) + return cpu_children + + record_function_children_names = [ + convert_remote_to_local(c.name) + for c in get_cpu_children(record_function_remote_event) + ] + for evt in remaining_remote_events: + local_name = convert_remote_to_local(evt.name) + if local_name not in remote_events_denylist: + self.assertTrue( + local_name in record_function_children_names, + f"{local_name} not in {record_function_children_names}", + ) def validate_profiling_workload(self, dst, prof): REMOTE_OP_STR = "#remote_op: " From 92ebb04f9206882e6d312a8b91318545f43a53c2 Mon Sep 17 00:00:00 2001 From: Himangshu Date: Thu, 24 Sep 2020 16:24:36 -0700 Subject: [PATCH 113/449] added check for NumberType (#44375) Summary: Fixes https://github.com/pytorch/pytorch/issues/44107 Pull Request resolved: https://github.com/pytorch/pytorch/pull/44375 Reviewed By: mrshenli Differential Revision: D23906728 Pulled By: eellison fbshipit-source-id: 3b534e5dd3af1f5e43a7314953e64117cbe8ffe4 --- torch/csrc/jit/frontend/ir_emitter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 690e52d7131d..99ce4140c58a 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -2189,13 +2189,13 @@ struct to_ir { NamedValue emitValueToTensor( const NamedValue& value, const NamedValue& matchTypeOf) { - // Add implicit conversion of int/float/bool types to tensors + // Add implicit conversion of int/float/bool/number types to tensors // Used in emitSubscriptAssign to convert: // `tensor(...)[x] = 99` to `tensor(...)[x] = tensor(99)` // Mirrors the `valueToTensor` behavior in python_variable_indexing.cpp const auto kind = value.type()->kind(); - if (kind == c10::TypeKind::IntType || kind == 
c10::TypeKind::BoolType || - kind == c10::TypeKind::FloatType) { + if (kind == c10::TypeKind::NumberType || kind == c10::TypeKind::IntType || + kind == c10::TypeKind::BoolType || kind == c10::TypeKind::FloatType) { auto dtype = graph->insert(prim::dtype, {matchTypeOf}, {}); auto device = graph->insert(prim::device, {matchTypeOf}, {}); auto converted = graph->insert( From 0b6e5ad4a92636ec82fb103b82303785c078407a Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Thu, 24 Sep 2020 16:38:14 -0700 Subject: [PATCH 114/449] Resolve comments in #44354. (#45150) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45150 Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D23846796 Pulled By: ailzhang fbshipit-source-id: 7bef89d833848ac3f8993c4c037acf1d4f2ca674 --- aten/src/ATen/core/boxing/KernelFunction.cpp | 1 + aten/src/ATen/core/dispatch/OperatorEntry.cpp | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b5d552e0e31c..f84352ebee1f 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) { void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) { TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. " + "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). " "If it's intended to override Math kernel behavior, please open an issue to request a dedicated " "Autograd dispatch key for the backend."); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5fa379e40710..0942659d2960 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat } bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const { - for (auto k : ks) { - if (kernels_.find(k) != kernels_.end()) { - return true; - } + TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end()); + for (auto& kv : kernels_) { + if (ks.has(kv.first)) return true; } return false; } @@ -196,6 +195,9 @@ std::pair OperatorEntry::computeDispatchTab // In the past we directly call into backends(filled with catchAll) after BackendSelect. // Now that we first call Autograd backend keys after BackendSelect, we should fill those // with catchAll as well. + // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend, + // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the + // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_] // (3) Use fallthrough kernel that are registered as fallback. // (4) Use catchAll kernel if available // Alias Key Precedence: @@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) { updateDispatchTableEntry_(dispatcher, k); } - // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2. 
+ // Note [Refresh Runtime Autograd entries in dispatchTable_] + // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); updateDispatchTableEntry_(dispatcher, autograd_key); } From 677a59dcaa72fbc91abfe01731a41e0849e81154 Mon Sep 17 00:00:00 2001 From: Daya Khudia Date: Thu, 24 Sep 2020 17:19:08 -0700 Subject: [PATCH 115/449] [aten] Call fbgemm functions for embedding prepack/unpack (#44845) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44845 fbgemm functions are vectorized and faster ``` Finished test run: https://our.intern.facebook.com/intern/testinfra/testrun/6473924484856786 Summary (total time 15.08s): PASS: 7 FAIL: 0 SKIP: 0 FATAL: 0 TIMEOUT: 0 OMIT: 0 ``` Performance Before: ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 68.727 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 131.500 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 248.190 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 172.742 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 333.008 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 652.423 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 167.282 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 398.901 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 785.254 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 122.653 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 230.617 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim512 # Input: 
num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 408.807 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 176.087 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 337.514 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 659.716 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 342.529 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 665.197 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 1307.923 ``` Performance After: ``` # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 10.782 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 17.443 # Benchmarking PyTorch: qembeddingbag_byte_prepack # Mode: Eager # Name: qembeddingbag_byte_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 25.898 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 13.903 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 18.575 # Benchmarking PyTorch: qembeddingbag_4bit_prepack # Mode: Eager # Name: qembeddingbag_4bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 30.650 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 14.158 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 19.818 # Benchmarking PyTorch: qembeddingbag_2bit_prepack # Mode: Eager # Name: qembeddingbag_2bit_prepack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 30.852 # Benchmarking PyTorch: 
qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 47.596 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 91.025 # Benchmarking PyTorch: qembeddingbag_byte_unpack # Mode: Eager # Name: qembeddingbag_byte_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 131.425 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 12.637 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 20.856 # Benchmarking PyTorch: qembeddingbag_4bit_unpack # Mode: Eager # Name: qembeddingbag_4bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 33.944 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim128 # Input: num_embeddings: 80, embedding_dim: 128 Forward Execution Time (us) : 21.181 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim256 # Input: num_embeddings: 80, embedding_dim: 256 Forward Execution Time (us) : 34.213 # Benchmarking PyTorch: qembeddingbag_2bit_unpack # Mode: Eager # Name: qembeddingbag_2bit_unpack_num_embeddings80_embedding_dim512 # Input: num_embeddings: 80, embedding_dim: 512 Forward Execution Time (us) : 59.622 ``` ghstack-source-id: 112836216 Test Plan: buck test //caffe2/test:quantization -- 'test_embedding_bag*' --print-passing-details Reviewed By: radkris-git Differential Revision: D23675777 fbshipit-source-id: 0b1a787864663daecc7449295f9ab6264eac52fc --- .../quantized/cpu/qembeddingbag_prepack.cpp | 118 ++++++++++-------- .../quantized/cpu/qembeddingbag_unpack.cpp | 17 ++- 2 files changed, 83 insertions(+), 52 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 6c67b6cc6c86..96d592594d04 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -104,8 +104,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { embedding_rows, embedding_cols + 8}; // extra 8 bytes to store FP scale and zero_point per row. 
- size_t output_columns = output_shape[1]; - constexpr float kEpsilon = 1e-8f; // Allocate output packed weights auto output = at::empty( @@ -114,6 +112,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + weight_data, embedding_rows, embedding_cols, output_data); +#else + size_t output_columns = output_shape[1]; + constexpr float kEpsilon = 1e-8f; for (std::size_t row = 0; row < embedding_rows; ++row) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; @@ -134,6 +138,8 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { lrintf((input_row[col] - minimum_element) * inverse_scale); } // embedding_cols } // embedding_rows +#endif // USE_FBGEMM + return output; } @@ -175,57 +181,69 @@ Tensor _qembeddingbag_nbit_prepack_helper( weight_contig.options().dtype(at::kByte), weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); - const auto output_columns = output.size(output.dim() - 1); - - for (int row = 0; row < embedding_rows; ++row) { - const float* input_row = weight_data + row * embedding_cols; - std::uint8_t* output_row = output_data + row * output_columns; - float Xmin, Xmax; - if (optimized_qparams) { - std::tie(Xmax, Xmin) = at::choose_qparams_optimized( - weight_contig[row], embedding_cols, 200, 0.16, bit_width); - } else { - Xmin = *std::min_element(input_row, input_row + embedding_cols); - Xmax = *std::max_element(input_row, input_row + embedding_cols); - } - Xmin = static_cast(Xmin); - float range = Xmax - Xmin; - // Set scale to 1.0f for the corner case of Xmax == Xmin . - // Any non-zero scale would work because during quantization - // (X - Xmin) / scale will be 0 for all X unless scale is 0. - at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); - float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; - if (scale == 0 || std::isinf(inverse_scale)) { - // Corner case handling when Xmax == Xmin - // Any scale would work because X - Xmin will be 0 for all X - scale = 1.0f; - inverse_scale = 1.0f; - } - // Update the scale and zero_point of each row. - at::Half* output_row_scale_zp = reinterpret_cast( - output_row + - (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - output_row_scale_zp[0] = scale; - output_row_scale_zp[1] = Xmin; - - // Pack the weight values. - for (int col = 0; col < embedding_cols; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min(lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); - // We pack 2 4-bit values in a byte. Index 0 is packed in the lower 4-bits - // and index 1 is packed in the upper 4-bits. 
- if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; +#ifdef USE_FBGEMM + if (!optimized_qparams) { + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_width, weight_data, embedding_rows, embedding_cols, output_data); + } else { +#endif // USE_FBGEMM + const auto output_columns = output.size(output.dim() - 1); + + for (int row = 0; row < embedding_rows; ++row) { + const float* input_row = weight_data + row * embedding_cols; + std::uint8_t* output_row = output_data + row * output_columns; + + float Xmin, Xmax; + if (optimized_qparams) { + std::tie(Xmax, Xmin) = at::choose_qparams_optimized( + weight_contig[row], embedding_cols, 200, 0.16, bit_width); } else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); + Xmin = *std::min_element(input_row, input_row + embedding_cols); + Xmax = *std::max_element(input_row, input_row + embedding_cols); } - } // embedding_cols - } // embedding_rows + Xmin = static_cast(Xmin); + float range = Xmax - Xmin; + // Set scale to 1.0f for the corner case of Xmax == Xmin . + // Any non-zero scale would work because during quantization + // (X - Xmin) / scale will be 0 for all X unless scale is 0. + at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); + float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; + if (scale == 0 || std::isinf(inverse_scale)) { + // Corner case handling when Xmax == Xmin + // Any scale would work because X - Xmin will be 0 for all X + scale = 1.0f; + inverse_scale = 1.0f; + } + // Update the scale and zero_point of each row. + at::Half* output_row_scale_zp = reinterpret_cast( + output_row + + (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); + + output_row_scale_zp[0] = scale; + output_row_scale_zp[1] = Xmin; + + // Pack the weight values. + for (int col = 0; col < embedding_cols; ++col) { + float X = input_row[col]; + std::uint8_t quantized = std::max( + 0, + std::min( + lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); + // We pack 2 4-bit values in a byte. Index 0 is packed in the lower + // 4-bits and index 1 is packed in the upper 4-bits. 
+ if (col % NUM_ELEM_PER_BYTE == 0) { + output_row[col / NUM_ELEM_PER_BYTE] = quantized; + } else { + output_row[col / NUM_ELEM_PER_BYTE] |= + (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); + } + } // embedding_cols + } // embedding_rows +#ifdef USE_FBGEMM + } +#endif // USE_FBGEMM + return output; } diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index 72d42c61d0e5..4a9ae73ee137 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -73,6 +73,10 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output_data); +#else for (std::size_t row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float* input_row_scale_zp = @@ -84,14 +88,17 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { input_row[col] * input_row_scale_zp[0] + input_row_scale_zp[1]; } // output_columns } // input_rows +#endif // USE_FBGEMM return output; } -Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_unpack_helper( + const Tensor& packed_weight, + int BIT_RATE) { const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); const auto* input_data = packed_weight.data_ptr(); - int NUM_ELEM_PER_BYTE = 8/BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. // The rest of input_columns is the number of values in the original row. @@ -105,6 +112,10 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA packed_weight.options().dtype(kFloat), packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + BIT_RATE, input_data, input_rows, input_columns, output_data); +#else auto output_columns = output_dimensions[1]; for (size_t row = 0; row < input_rows; ++row) { float* output_row = output_data + row * output_columns; @@ -122,6 +133,8 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA output_row[col] = scale * quantized + zero_point; } // output_columns } // input_rows +#endif // USE_FBGEMM + return output; } From 03dde4c62af35a8a8a0c2e1ea9f6486ac897a780 Mon Sep 17 00:00:00 2001 From: Dianshi Li Date: Thu, 24 Sep 2020 18:39:54 -0700 Subject: [PATCH 116/449] Resend diff D23858329 (#45315) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45315 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45314 in D23858329 (https://github.com/pytorch/pytorch/commit/721cfbf8425cf2c1dc5e27d1332e32e1a42ef541), we put PriorCorrectionCalibrationPrediction unit test in OSS file which causes test failure issue in public trunk. this diff moves it to FB only test file. Test Plan: ``` buck test //caffe2/caffe2/python/operator_test:torch_integration_test -- test_gather_ranges_to_dense_op buck test //caffe2/caffe2/fb/python/operator_test:torch_integration_test -- test_prior_correct_calibration_prediction_op ``` all pass. 
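For reference, a minimal sketch (not part of this diff) of how the exported operator becomes reachable from Python once the C10 registration below is in place; the toy inputs and keyword arguments simply mirror the new OSS test added further down.

```
import torch

# Toy inputs shaped like the ones in the new test_gather_ranges_to_dense_op test.
data = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])
ranges = torch.tensor([[[2, 4]], [[0, 0]]])
key = torch.tensor([0, 1, 3, 2, 1, 0, 1, 0])

# The C10_EXPORT_CAFFE2_OP_TO_C10_CPU registration exposes the op under the
# torch.ops._caffe2 namespace with the schema declared in the .cc file.
outputs = torch.ops._caffe2.GatherRangesToDense(
    data,
    ranges,
    key,
    lengths=[4],
    min_observation=2,
    max_mismatched_ratio=0.5,
    max_empty_ratio=1.0,
)
# `outputs` is a list of dense tensors, one per entry in `lengths`.
```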
Reviewed By: houseroad Differential Revision: D23899012 fbshipit-source-id: 1ed97d8702e2765991e6caf5695d4c49353dae82 --- caffe2/operators/gather_ranges_to_dense_op.cc | 8 ++++ caffe2/operators/gather_ranges_to_dense_op.h | 3 ++ .../operator_test/torch_integration_test.py | 41 +++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index 55f26a89987f..9bec64764240 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -875,6 +875,47 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): From 
0f2c648c970d33fe7cc6a8198e9ce59a584ae734 Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Thu, 24 Sep 2020 20:06:42 -0700 Subject: [PATCH 117/449] log metadata when model loading failed (#44430) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44430 log metadata even when model loading is failed Test Plan: {F331550976} Reviewed By: husthyc Differential Revision: D23577711 fbshipit-source-id: 0504e75625f377269f1e5df0f1ebe34b8e564c4b --- torch/csrc/jit/mobile/import.cpp | 19 +++++++++++++++---- torch/csrc/jit/mobile/observer.h | 3 +++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index e812fd978c9f..e26177605674 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -228,6 +228,8 @@ class BytecodeDeserializer final { public: explicit BytecodeDeserializer(std::unique_ptr reader); mobile::Module deserialize(c10::optional device); + std::unordered_map deserializeMetadata( + c10::optional device); private: c10::IValue readArchive( @@ -246,6 +248,13 @@ BytecodeDeserializer::BytecodeDeserializer( : compilation_unit_(std::make_shared()), reader_(std::move(reader)) {} +std::unordered_map BytecodeDeserializer:: + deserializeMetadata(c10::optional device) { + device_ = device; + auto mcu = std::make_shared(); + return readMobileMetadata(mcu); +} + mobile::Module BytecodeDeserializer::deserialize( c10::optional device) { device_ = device; @@ -397,9 +406,9 @@ mobile::Module _load_for_mobile( if (observer) { observer->onEnterLoadModel(); } + auto reader = torch::make_unique(std::move(rai)); + BytecodeDeserializer deserializer(std::move(reader)); try { - auto reader = torch::make_unique(std::move(rai)); - BytecodeDeserializer deserializer(std::move(reader)); mobile::Module result = deserializer.deserialize(std::move(device)); std::unordered_map copied_metadata = result.metadata(); @@ -412,7 +421,8 @@ mobile::Module _load_for_mobile( return result; } catch (c10::Error& error) { if (observer) { - observer->onFailLoadModel(error.what()); + observer->onFailLoadModel( + error.what(), deserializer.deserializeMetadata(std::move(device))); } TORCH_RETHROW(error); } catch (...) { @@ -429,7 +439,8 @@ mobile::Module _load_for_mobile( } } catch (c10::Error& error) { if (observer) { - observer->onFailLoadModel(error.what()); + observer->onFailLoadModel( + error.what(), deserializer.deserializeMetadata(std::move(device))); } TORCH_RETHROW(error); } diff --git a/torch/csrc/jit/mobile/observer.h b/torch/csrc/jit/mobile/observer.h index fde99f501f72..2935fa078fc7 100644 --- a/torch/csrc/jit/mobile/observer.h +++ b/torch/csrc/jit/mobile/observer.h @@ -78,6 +78,9 @@ class MobileModuleObserver { virtual void onExitLoadModel( const std::unordered_map&) {} virtual void onFailLoadModel(const char*) {} + virtual void onFailLoadModel( + const char*, + const std::unordered_map&) {} }; class MobileObserverConfig { From 7e5492e1bedef05752f8c8961d8bcc1a7e5f641e Mon Sep 17 00:00:00 2001 From: Xiao Wang <24860335+xwang233@users.noreply.github.com> Date: Thu, 24 Sep 2020 20:09:47 -0700 Subject: [PATCH 118/449] [minor] Fix undefined variable (#45246) Summary: The commit https://github.com/pytorch/pytorch/commit/2a37f3fd2f74e2d10f3440e6dfef2d5389caab62 https://github.com/pytorch/pytorch/pull/45130 deleted the python variable `capability` which is used in later lines. 
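For context, a self-contained sketch of the failure mode (the helper name and warning text below are illustrative placeholders, not the real module): the warning string is formatted with `capability`, so the branch has to recompute it from the device's major/minor capability before calling `warnings.warn`, which is exactly the one-line fix in the diff.

```
import warnings

# Placeholder warning text; the real message lives in torch/cuda/__init__.py.
INCOMPATIBLE_DEVICE_WARN = (
    "{} with CUDA capability sm_{} is not compatible with the current PyTorch "
    "installation, which supports capabilities {}. Check the install matrix "
    "before using the {} GPU."
)

def warn_if_unsupported(device_name, cap_major, cap_minor, arch_list, supported):
    if not supported:
        # Without this assignment the format() call below referenced an
        # undefined name and raised NameError instead of emitting the warning.
        capability = cap_major * 10 + cap_minor
        warnings.warn(
            INCOMPATIBLE_DEVICE_WARN.format(
                device_name, capability, " ".join(arch_list), device_name
            )
        )

warn_if_unsupported("NVIDIA A100", 8, 0, ["sm_37", "sm_50", "sm_60", "sm_70"], supported=False)
```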
Pull Request resolved: https://github.com/pytorch/pytorch/pull/45246 Reviewed By: walterddr Differential Revision: D23923916 Pulled By: malfet fbshipit-source-id: c5d7fef9e4a87ccc621191200e5965710e9d6aaa --- torch/cuda/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index e8687cad17e8..1176c6ee3060 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -100,6 +100,7 @@ def _check_cubins(): supported = any([sm // 10 == cap_major for sm in supported_sm]) if not supported: device_name = get_device_name(idx) + capability = cap_major * 10 + cap_minor warnings.warn(incompatible_device_warn.format(device_name, capability, " ".join(arch_list), device_name)) From 630bd85aae958495682fb5959f5a97832c2223d7 Mon Sep 17 00:00:00 2001 From: Jiakai Liu Date: Thu, 24 Sep 2020 20:15:31 -0700 Subject: [PATCH 119/449] [pytorch] refine dispatch keys in native_functions.yaml (2/N) (#45284) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45284 This is the 2nd batch of the change described in #45010. In this batch we relaxed some filters to cover more 'backend specific' ops: * ops that not call any 'Tensor::is_xxx()' method OR only call 'Tensor::is_cuda()' - we are adding CUDA dispatch key anyway; * ops that call other ATen ops but ARE differentiable - differentiability is a fuzzy indicator of not being 'composite'; Inherited other filters from the 1st batch: * These ops don't already have dispatch section in native_functions.yaml; * These ops call one or more DispatchStub (thus "backend specific"); Differential Revision: D23909901 Test Plan: Imported from OSS Reviewed By: ailzhang Pulled By: ljk53 fbshipit-source-id: 3b31e176324b6ac814acee0b0f80d18443bd81a1 --- aten/src/ATen/native/native_functions.yaml | 148 +++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index f5bbb263ed9c..0d5582572d6e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -226,6 +226,8 @@ variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: abs_out # Note [Adding an alias] # To add an alias do the following: @@ -268,6 +270,8 @@ variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -285,6 +289,8 @@ variants: method - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -425,8 +431,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -440,8 +450,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -688,9 +702,13 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -900,6 +918,8 @@ variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full @@ -910,6 +930,8 @@ variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full @@ -920,6 +942,8 @@ variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_min_out # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -1811,6 +1835,8 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) @@ -1843,6 +1869,8 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor use_c10_dispatcher: full @@ -2142,6 +2170,8 @@ - func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor use_c10_dispatcher: full @@ -2171,6 +2201,8 @@ variants: function, method - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2187,6 +2219,8 @@ variants: function, method - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amax_out # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -2258,6 +2292,8 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2269,6 +2305,8 @@ variants: function, method - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor use_c10_dispatcher: full @@ -2584,18 +2622,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2899,10 +2945,14 @@ - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3191,27 +3241,39 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full @@ -3241,23 +3303,33 @@ - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3267,12 +3339,18 @@ - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -3428,6 +3506,8 @@ variants: function, method - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out # Alias for trunc - func: fix(Tensor self) -> Tensor @@ -3506,12 +3586,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3521,10 +3607,14 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function @@ -3560,6 +3650,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -3720,8 +3812,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3830,6 +3926,8 @@ - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -4279,6 +4377,8 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4287,6 +4387,8 @@ - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4295,6 +4397,8 @@ - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4303,6 +4407,8 @@ - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4999,6 +5105,8 @@ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method @@ -5037,10 +5145,14 @@ device_guard: False - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5711,6 +5823,8 @@ - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: @@ -5782,6 +5896,8 @@ - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5906,8 +6022,12 @@ - func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: maximum - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -5920,8 +6040,12 @@ - func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: minimum - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -6002,6 +6126,8 @@ - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6077,18 +6203,32 @@ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -6396,10 +6536,14 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6562,6 +6706,8 @@ - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss - func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7603,6 +7749,8 @@ - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) 
grad_input) -> Tensor(a!) python_module: nn From c6500bcf1494aadf7bd86adb554fdad376b7f105 Mon Sep 17 00:00:00 2001 From: Yanli Zhao Date: Thu, 24 Sep 2020 20:52:17 -0700 Subject: [PATCH 120/449] [reland] Make grad point to bucket buffer in DDP to save memory usage (#44344) Summary: [test all] Pull Request resolved: https://github.com/pytorch/pytorch/pull/44344 reland #41954 Add one argument in DDP API to enable/disable letting grads pointing to views. When it is disabled, behavior is the same as DDP right now; when it is enabled, Make both variable.grad() and grad in distautograd context point to bucket buffer in DDP to save memory usage. In this case, grad will be view of bucket buffer tensors, in order to make it compatiable with optimizer.zero_grad(), we made changes in #41283. Also be noted that we can not make variable.grad() pointing to bucket buffer during construction time, because we want to keep grad undefined for unused parameters. ghstack-source-id: 112845787 Test Plan: 1. When grad_is_view=false: a. roberta_base, peak memory usage 8250MB, p50 per iteration latency 0.923second, https://www.internalfb.com/intern/fblearner/details/218029699/?notif_channel=cli b. resnet, peak memory usage 3089MB, p50 per iteration latency 0.120second, https://www.internalfb.com/intern/fblearner/details/218029035/?notif_channel=cli c. accuracy benchmark, distributed=false, .accuracy 40.914535522461, .loss: 1.6370717287064; distributed=true, .accuracy: 39.966053009033, .loss: 1.6849111318588 https://www.internalfb.com/intern/fblearner/details/218035688/?notif_channel=cli d. classy vision uru production flow, https://www.internalfb.com/intern/fblearner/details/219065811/?notif_channel=cli e. pytext flow, https://www.internalfb.com/intern/fblearner/details/219137458/?notif_channel=cli 2. When grad_is_view=true: a. roberta_base, peak memory usage 7183MB, p50 per iteration latency 0.908second, https://www.internalfb.com/intern/fblearner/details/217882539?tab=operator_details b. resnet, peak memory usage 2988 MB, p50 per iteration latency 0.119second, https://www.internalfb.com/intern/fblearner/details/218028479/?notif_channel=cli c. accuracy benchmark, distributed=false, .accuracy 41.713260650635, .loss: 1.69939661026; distributed=true, .accuracy: 39.966053009033, .loss: 1.6849111318588, https://www.internalfb.com/intern/fblearner/details/218037058/?notif_channel=cli d. classy vision uru production flow, expected, can not work well with apex.amp https://www.internalfb.com/intern/fblearner/details/219205218/?notif_channel=cli e. pytext flow, detach_() related error, expected, as pytext zero_grad depends on apex repo where detach_() is called. also seeing the warning in finalize_bucket_dense due to tied weights, which is expected. 
https://www.internalfb.com/intern/fblearner/details/219150229/?notif_channel=cli Reviewed By: mrshenli Differential Revision: D23588186 fbshipit-source-id: f724d325b954ef6f06ede31759bf01dd29a6f5e5 --- test/distributed/test_c10d.py | 180 +++++++++---- torch/csrc/autograd/VariableTypeManual.cpp | 7 +- .../csrc/autograd/functions/accumulate_grad.h | 5 + torch/csrc/distributed/c10d/init.cpp | 2 + torch/csrc/distributed/c10d/reducer.cpp | 239 +++++++++++++----- torch/csrc/distributed/c10d/reducer.h | 17 +- torch/nn/parallel/distributed.py | 29 ++- .../_internal/distributed/distributed_test.py | 66 ++++- 8 files changed, 425 insertions(+), 120 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 64e255fce3e6..a81bc53f175a 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -1974,13 +1974,15 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_single_device_module( + self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).to(devices[0]), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) model.to(devices[0]) @@ -1989,7 +1991,7 @@ def _prepare_single_device_module(self, process_group, devices, device_ids, glob return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): self.assertTrue( len(devices) == 2 or len(devices) == 4, "unexpected devices for ddp tests {}".format(devices)) @@ -2002,14 +2004,15 @@ def _prepare_multi_device_module(self, process_group, devices, device_ids, globa copy.deepcopy(model), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): """ Note: we pass down `device_ids` all the way to DistributedDataParallel as part of the test. 
Below you find tests that either use a list of @@ -2023,11 +2026,11 @@ def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2062,17 +2065,21 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, devices, device_ids, multi_device=False): + def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_gloo() def test_gloo_backend_cpu_module(self): self._test_gloo_backend([torch.device("cpu")], []) + @requires_gloo() + def test_gloo_backend_cpu_module_grad_is_view(self): + self._test_gloo_backend([torch.device("cpu")], [], gradient_as_bucket_view=True) + @requires_gloo() @skip_if_not_multigpu def test_gloo_backend_1gpu_module_device_ids_integer_list(self): @@ -2101,10 +2108,10 @@ def test_gloo_backend_4gpu_module(self): devices = [torch.device("cuda:" + str(i)) for i in int_devices] self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, devices, device_ids, multi_device=False): + def _test_nccl_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_nccl() @skip_if_not_multigpu @@ -2169,10 +2176,7 @@ def test_ddp_multi_device_module_config(self): ddp_model = DistributedDataParallel( model, device_ids=gpus, process_group=process_group) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_fp16(self): + def _test_fp16(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2184,6 +2188,7 @@ def test_fp16(self): device_ids=[gpus[0]], process_group=process_group, bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view ) # Input 2**15, so that the gradients will overflow with a @@ -2204,7 +2209,16 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_arbitrary_forward_return_value(self): + def test_fp16(self): + self._test_fp16() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, 
gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2240,6 +2254,7 @@ def forward(self, x, fn): ForwardReturnValueModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2295,7 +2310,16 @@ def test(box, unbox): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_find_unused_parameters_kwarg(self): + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2325,12 +2349,13 @@ def forward(self, x): input = torch.rand([batch_size, 2], dtype=torch.float) target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) - def test_find_unused_parameters(find_unused_parameters, test_default=False): + def test_find_unused_parameters(find_unused_parameters, test_default=False, gradient_as_bucket_view=False): if test_default: model = DistributedDataParallel( FindUnusedParametersModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) else: model = DistributedDataParallel( @@ -2338,6 +2363,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): device_ids=[device_id], process_group=process_group, find_unused_parameters=find_unused_parameters, + gradient_as_bucket_view=gradient_as_bucket_view, ) output, fc3 = model(input) @@ -2349,7 +2375,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # trigger an error when `backward` is called (because fc3 is an unused # parameter and will therefore be marked ready twice). try: - test_find_unused_parameters(True) + test_find_unused_parameters(True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.assertTrue( str(ex).startswith("Expected to mark a variable ready only once.")) @@ -2359,19 +2385,29 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # Then test that the default behavior can be overridden by setting # `find_unused_parameters=False`. 
try: - test_find_unused_parameters(False) + test_find_unused_parameters(False, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) # Test find_unused_parameters defaults to False try: - test_find_unused_parameters(True, test_default=True) + test_find_unused_parameters(True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad(self): + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg_grad_is_view(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): """ By simulating a multi-task training, this test is to make sure: 1) DDP does not touch the grad of globally unused parameters. @@ -2417,6 +2453,7 @@ def run_and_verify_grad(model): GlobalLocalUnusedParamModule().cpu(), process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(cpu_model) @@ -2427,9 +2464,20 @@ def run_and_verify_grad(model): device_ids=[device_id], process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(gpu_model) + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad(self): + self._test_global_local_unused_params_grad() + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad_with_grad_is_view(self): + self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) + @requires_gloo() @skip_if_lt_x_gpu(2) def test_find_unused_parameters_when_unused_parameters_empty(self): @@ -2486,10 +2534,7 @@ def run_and_verify_grad(model): ) run_and_verify_grad(gpu_model) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_multiple_outputs_multiple_backward(self): + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2523,6 +2568,7 @@ def forward(self, x): MultipleOutputModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2537,6 +2583,18 @@ def forward(self, x): loss2 = criterion(output2, target) loss2.backward() + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_not_multigpu @skip_if_rocm @@ -2586,7 +2644,7 @@ def check_no_grads(): # No parameter should have their gradient set. check_no_grads() - def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): + def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False): """ This is the recommended way to implement accumulate grads. 
If ``ddp_comm_hook`` input was specified, it will also register that hook @@ -2601,7 +2659,7 @@ def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): local_batch_size = len(devices) model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size + process_group, devices, devices, global_batch_size, gradient_as_bucket_view ) if ddp_comm_hook is not None: @@ -2658,6 +2716,15 @@ def test_accumulate_gradients_no_sync(self): """ self._test_accumulate_gradients_no_sync() + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_not_multigpu @skip_if_rocm @@ -2708,10 +2775,7 @@ def div(fut): num_iters=4, ddp_comm_hook=allreduce_with_then_hook ) - @requires_nccl() - @skip_if_not_multigpu - @skip_if_rocm - def test_accumulate_gradients_module(self): + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying # module. @@ -2723,7 +2787,7 @@ def test_accumulate_gradients_module(self): model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, devices, global_batch_size) + process_group, devices, devices, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2763,6 +2827,18 @@ def step_model(model, input, target): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module(self): + self._test_accumulate_gradients_module() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module_with_grad_is_view(self): + self._test_accumulate_gradients_module(gradient_as_bucket_view=True) + @requires_gloo() def test_ignored_output(self): """ @@ -3022,8 +3098,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): ddp_parameter = next(ddp_model.parameters()) self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) - @requires_gloo() - def test_sparse_gradients(self): + def _test_sparse_gradients(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) @@ -3034,10 +3109,19 @@ def test_sparse_gradients(self): ddp_model = DistributedDataParallel( copy.deepcopy(vanilla_model), process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + @requires_gloo() + def test_sparse_gradients(self): + self._test_sparse_gradients() + + @requires_gloo() + def test_sparse_gradients_grad_is_view(self): + self._test_sparse_gradients(gradient_as_bucket_view=True) + def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3206,12 +3290,13 @@ def test_ddp_comm_hook_future_passing_cpu(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2)) - def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): + def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None, gradient_as_bucket_view=False): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) # Register DDP Communication Hook if defined @@ -3276,10 +3361,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - @requires_nccl() - @skip_if_lt_x_gpu(2) - @skip_if_rocm - def test_ddp_comm_hook_allreduce_hook_nccl(self): + def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): """ This unit test verifies whether a DDP communication hook that just calls allreduce gives the same result result with the case of no hook registered. @@ -3294,11 +3376,23 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: return process_group.allreduce(tensors).get_future() # Get GPU model with allreduce_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook) + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook, gradient_as_bucket_view) # check whether the grads are equal to what DDP without hook would return. self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl(self): + self._test_ddp_comm_hook_allreduce_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + @requires_nccl() @skip_if_lt_x_gpu(2) @skip_if_rocm diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index c72c67eb5230..18e5e4f54820 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -269,7 +269,12 @@ Tensor & detach_(Tensor & self) { "of detach_(). Alternatively, create this view with an " "`unsafe_` version of the function that produced it."); } else { - AT_ERROR("Can't detach views in-place. Use detach() instead"); + AT_ERROR("If you are using DistributedDataParallel (DDP) for training, " + "and gradient_as_bucket_view is set as True, gradients are " + "views of DDP buckets, and hence detach_() cannot be called " + "on these gradients. To fix this error, please refer to the " + "Optimizer.zero_grad() function in torch/optim/optimizer.py " + "as the solution."); } } // I think the choice here is conservative. In principle, doing diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index e1a02dc19fd8..dafd07f64b84 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -161,6 +161,11 @@ struct TORCH_API AccumulateGrad : public Node { // valid operation which adds `new_grad` to `variable_grad` in // place. `variable_grad` is thus still referring to the same tensor // after the operation. + // Also DistributedDataParallel(DDP) package relies on grad being + // mutated in place for saving peak memory usage. 
DDP will still + // work correctly if it is mutated out of place here, but DDP will + // maintain one extra copy of grad tensors in buffer and thus + // increase peak memory usage. variable_grad += new_grad; CHECK_RESULT(variable_grad, variable); // ^ We could enforce the contract more aggressively here by writing: diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index aff2da31c133..165d6a1c8603 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -159,6 +159,7 @@ PyObject* c10d_init(PyObject* _unused) { std::shared_ptr<::c10d::ProcessGroup>, std::vector>, int64_t, + bool, bool>(), py::arg("replicas"), py::arg("bucket_indices"), @@ -166,6 +167,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("expect_sparse_gradients") = std::vector>(), py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, py::call_guard()) .def( "initialize_buckets", diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 1a5766eea84e..86916c7994dd 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -32,7 +32,8 @@ Reducer::Reducer( std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters) + bool find_unused_parameters, + bool gradient_as_bucket_view) : replicas_(std::move(replicas)), process_group_(std::move(process_group)), expect_sparse_gradients_(std::move(expect_sparse_gradients)), @@ -41,6 +42,7 @@ Reducer::Reducer( next_bucket_(0), has_marked_unused_parameters_(false), find_unused_parameters_(find_unused_parameters), + gradient_as_bucket_view_(gradient_as_bucket_view), local_used_maps_reduced_(false), backward_stats_base_(0), has_rebuilt_bucket_(false), @@ -310,6 +312,56 @@ void Reducer::verify_replica0_across_processes() { } } +void Reducer::check_grad_layout( + const at::Tensor& grad, + const at::Tensor& bucket_view) { + // Ensure that the gradient type matches the bucket type. + TORCH_CHECK( + grad.options().type_equal(bucket_view.options()), + "Expected ", + bucket_view.toString(), + ", got ", + grad.toString()); + TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); + TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); + // AccumulateGrad doesn't HAVE to obey the grad layout contract. + // The penalty for disobedience is reduced performance, not numerical + // death. Warnings here help diagnose poor DDP performance. + if (grad.strides() != bucket_view.strides()) { + TORCH_WARN_ONCE( + "Grad strides do not match bucket view strides. " + "This may indicate grad was not created according to the " + "gradient layout contract, or that the param's strides " + "changed since DDP was constructed. This is not an error, " + "but may impair performance.\n" + "grad.sizes() = ", + grad.sizes(), + ", strides() = ", + grad.strides(), + "\n", + "bucket_view.sizes() = ", + bucket_view.sizes(), + ", strides() = ", + bucket_view.strides()); + } + if (!gradient_as_bucket_view_) { + TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); + } +} + +void Reducer::copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view) { + // See Note [DDP Communication Hook] + if (comm_hook_ == nullptr) { + // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp + auto wrapped = c10::scalar_to_tensor(double(1.) 
/ divFactor_); + wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); + // Divides while copying into the bucket view. + at::native::mul_out(bucket_view, grad, wrapped); + } else { + bucket_view.copy_(grad); + } +} + void Reducer::mark_variable_ready_dense(VariableIndex index) { const auto replica_index = index.replica_index; const auto variable_index = index.variable_index; @@ -327,49 +379,27 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // of the bucket it would otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { - // Ensure that the gradient type matches the bucket type. - TORCH_CHECK( - grad.options().type_equal(bucket_view.options()), - "Expected ", - bucket_view.toString(), - ", got ", - grad.toString()); - // Assert that the grad tensor and the bucket don't share storage. - // If they did, we could avoid the copy altogether. - // The reason for not doing this is that existing code calls - // `detach_` from `zero_grad`, which is incompatible with views. - TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); - TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); - TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); - // AccumulateGrad doesn't HAVE to obey the grad layout contract. - // The penalty for disobedience is reduced performance, not numerical - // death. Warnings here help diagnose poor DDP performance. - if (grad.strides() != bucket_view.strides()) { - TORCH_WARN_ONCE( - "Grad strides do not match bucket view strides. " - "This may indicate grad was not created according to the " - "gradient layout contract, or that the param's strides " - "changed since DDP was constructed. This is not an error, " - "but may impair performance.\n" - "grad.sizes() = ", - grad.sizes(), - ", strides() = ", - grad.strides(), - "\n", - "bucket_view.sizes() = ", - bucket_view.sizes(), - ", strides() = ", - bucket_view.strides()); - } - // See Note [DDP Communication Hook] - if (comm_hook_ == nullptr) { - // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp - auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); - wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); - // Divides while copying into the bucket view. - at::native::mul_out(bucket_view, grad, wrapped); + this->check_grad_layout(grad, bucket_view); + // When gradient_as_bucket_view_ is false, or even when + // gradient_as_bucket_view_ is true, in rare cases users may set grad to + // be None after every iteration. In these cases, grad and bucket_view are + // pointing to different storages and thus need to copy grads to + // bucket_view. If gradient_as_bucket_view_ is set as true, let grad point + // to bucket_view. If grad has already been set as views of buckets in + // previous iterations, no copy is needed. + if (!grad.is_alias_of(bucket_view)) { + this->copy_grad_to_bucket(grad, bucket_view); + if (gradient_as_bucket_view_) { + // Let grad point to bucket_view buffer. + grad = bucket_view; + // The grad is modified and need to be written back. 
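A rough Python restatement of the branch above (a toy sketch, not the C++ path, and ignoring the registered-comm-hook case) may help when reading the reducer change:

    import torch

    def mark_ready_dense(grad, bucket_view, gradient_as_bucket_view, div_factor):
        if grad is None:
            bucket_view.zero_()                # unused parameter: clear its slot
            return grad
        if grad.data_ptr() != bucket_view.data_ptr():
            # Grad lives elsewhere: scale by 1/div_factor while copying it in.
            torch.mul(grad, 1.0 / div_factor, out=bucket_view)
            # With gradient_as_bucket_view, grad is repointed to the bucket view.
            return bucket_view if gradient_as_bucket_view else grad
        # Grad already is the bucket view: just scale it in place.
        bucket_view.div_(div_factor)
        return grad
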
+ return true; + } } else { - bucket_view.copy_(grad); + // If grad and bucket view point to the same storage, no need to copy + if (comm_hook_ == nullptr) { + bucket_view.div_(divFactor_); + } } } else { bucket_view.zero_(); @@ -674,6 +704,17 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { void Reducer::initialize_buckets( std::vector> bucket_indices) { + // If initialize_buckets is called inside DDP constructor, then + // it does not matter rpc context ptr is nullptr or not, as grad + // will not be mutated. + // If initialize_buckets is called during training loop, e.g, inside + // rebuild_buckets(), since grad could be mutated and be pointed to + // bucket_view, then it needs to check rpc context ptr is nullptr or not, + // If rpc context ptr is nullptr, mutate variable.grad(); otherwise, + // mutate grad in rpc context. + using torch::distributed::autograd::ThreadLocalDistAutogradContext; + this->rpc_context_.set(ThreadLocalDistAutogradContext::getContextPtr()); + // This shouldn't be called if we're expecting autograd hooks to fire. TORCH_CHECK( !expect_autograd_hooks_, @@ -825,7 +866,7 @@ void Reducer::initialize_bucket_views( Reducer::BucketReplica& replica, at::Tensor& contents) { for (size_t i = 0; i < replica.variables.size(); i++) { - const auto& v = replica.variables[i]; + auto& v = replica.variables[i]; const auto offset = replica.offsets[i]; const auto length = replica.lengths[i]; if (v.is_non_overlapping_and_dense()) { @@ -844,6 +885,29 @@ void Reducer::initialize_bucket_views( // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. replica.bucket_views_out = replica.bucket_views_in; + + // If gradient_as_bucket_view_ is set as true, then there are two cases to + // handle: initialize_bucket_views could be called inside initialize_buckets + // when rebuild_buckets, if grad has already been defined/calculated in + // previous iteration, old grad needs to be copied into new bucket_view and + // let grad point to the new bucket_view, initialize_bucket_views could also + // be called inside initialize_buckets during construction. Grads are not + // defined during construction time, in this case, do not let grad point to + // bucket_view, because grads should be kept as being undefined for globally + // unused parameters. + if (gradient_as_bucket_view_) { + auto& bucket_view = replica.bucket_views_in.back(); + runGradCallbackForVariable(v, [&](auto& grad) { + if (grad.defined() && !grad.is_alias_of(bucket_view)) { + bucket_view.copy_(grad); + grad = bucket_view; + // The grad is modefied and needs to be written back. + return true; + } + // The grad is not modified and does not need to be written back. + return false; + }); + } } } @@ -965,6 +1029,31 @@ void Reducer::prepare_for_backward( } } +void Reducer::copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused) { + const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + if (!grad.defined()) { + // Creates grad according to the "Gradient Layout Contract" + // (see torch/csrc/grad/AccumulateGrad.h) + grad = + torch::autograd::utils::clone_obey_contract(bucket_view, variable); + } else { + grad.copy_(bucket_view); + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. 
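The bucket/view relationship that initialize_bucket_views maintains can be pictured with a small, self-contained sketch; shapes and the narrow/view calls are illustrative only, the real code uses the stored offsets and lengths (and strided views for dense parameters):

    import torch

    params = [torch.randn(3, 4), torch.randn(5)]
    contents = torch.zeros(sum(p.numel() for p in params))   # one flat bucket
    views, offset = [], 0
    for p in params:
        n = p.numel()
        views.append(contents.narrow(0, offset, n).view_as(p))
        offset += n
    # With gradient_as_bucket_view=True and an already-defined grad, the old
    # grad is copied into its view once and the parameter's .grad then points
    # at that view.
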
+ return false; + }); +} + // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { for (size_t replica_index = 0; replica_index < bucket.replicas.size(); @@ -1015,24 +1104,52 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { } } - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; - runGradCallbackForVariable(variable, [&](auto& grad) { - // If a parameter is globally unused, we keep its grad untouched. - if (!global_unused) { - if (!grad.defined()) { - // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) - grad = torch::autograd::utils::clone_obey_contract( - bucket_view, variable); - } else { - grad.copy_(bucket_view); - } - // The grad is modified and needs to be written back. - return true; + if (!gradient_as_bucket_view_) { + copy_bucket_to_grad( + variable, replica, intra_bucket_index, global_unused); + } else { + const auto& bucket_view_out = + replica.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; + // If communication_hook is registered, bucket_view_out stores + // allreduced results in a newly allocated tensor, copy bucket_view_out + // back to bucket_view_in that referring to replica.content tensor and + // grad. + if (!bucket_view_in.is_alias_of(bucket_view_out)) { + bucket_view_in.copy_(bucket_view_out); } - // The grad is not modified. - return false; - }); + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + // If grad is globally used but locally unused, let grad point to + // bucket_view_in + if (!grad.defined()) { + grad = bucket_view_in; + } else { + if (!grad.is_alias_of(bucket_view_in)) { + grad.copy_(bucket_view_in); + TORCH_WARN_ONCE( + "Detected at least one parameter gradient is not the " + "expected DDP bucket view when setting " + "gradient_as_bucket_view=True. This can happen when " + "multiple parameters sharing the same gradient. For " + "example, param0 and param1 share the same gradient " + "grad0. In this case, grad0 would first point to " + "bucket_view_in0 when param0 is ready. Later, when " + "param1 is ready, it will override grad0 to point to " + "bucket_view_in1. However, param0 still expects grad0 " + "to point to bucket_view_in0, and hence hit this " + "warning. If you saw this message, please double-check if " + "the above situation is expected for your application."); + } + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); + } } } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 3b441c99a3b6..960a32356acf 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -30,7 +30,8 @@ class Reducer { std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters); + bool find_unused_parameters, + bool gradient_as_bucket_view); ~Reducer() noexcept(false); @@ -124,6 +125,7 @@ class Reducer { bool has_marked_unused_parameters_; const bool find_unused_parameters_; + const bool gradient_as_bucket_view_; std::vector unused_parameters_; // Locally used parameter maps indicating if parameters are used locally // during the current iteration or no_sync session if no_sync is on. 
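copy_bucket_to_grad above is the write-back half of the non-view path; in Python terms it behaves roughly like the following sketch, where clone_obey_contract is approximated with a plain clone:

    def copy_bucket_to_grad(param, bucket_view, globally_unused):
        if globally_unused:
            return                        # leave a globally unused grad untouched
        if param.grad is None:
            # The real code uses clone_obey_contract() to honor the gradient
            # layout contract; a clone is the closest Python analogue.
            param.grad = bucket_view.clone()
        else:
            param.grad.copy_(bucket_view)
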
One @@ -230,6 +232,19 @@ class Reducer { // with the result of `future_work`. void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + // If gradient_as_bucket_view_ is false, after allreduce buckets, + // copy bucket results back to grads. + void copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused); + // Check layout of grad and bucket_view before calling copy_grad_to_bucket + void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); + // If gradient_as_bucket_view_ is false, before allreduce buckets, + // copy grads to buckets. + void copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view); + // A bucket holds N bucket replicas (1 per model replica). // // If every bucket in this struct is ready, the reduction can be kicked off. diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 790a9d1c2fc4..5ec2b0148a21 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -316,6 +316,28 @@ class DistributedDataParallel(Module): are getting different gradients, which should not happen if DistributedDataParallel is correctly used. (default: ``False``) + gradient_as_bucket_view (bool): this is a prototype feature. When set to ``True``, + gradients will be views pointing to different offsets of + allreduce communication buckets. This can reduce peak memory + usage, where the saved memory size will be equal to the total + gradients size. Moreover, it avoids the overhead of copying + between gradients and allreduce communication buckets. + When gradients are views, ``detach_()`` cannot be called on the + gradients. If hitting such errors, please fix it by referring to + the :meth:`~torch.optim.Optimizer.zero_grad` function in + ``torch/optim/optimizer.py`` as the solution. + Warning! It is also found that ``gradient_as_bucket_view = true`` + does not work as expected when ``apex.amp`` is used for + mixed precision training. ``apex.amp`` maintained stashed gradients + that are used for unscaling gradients. These stashed gradients + are pointed to gradients (will be communication buckets when + ``gradient_as_bucket_view = true``) before starting new iteration. + In new iteration, the communication buckets are mutated and thus + these stashed gradients will be unexpectedly mutated as well, + the unexpectedly muated stashed gradients may result in wrong + results. To fix it, these stashed gradients should not be pointed + to gradients, instead they should be copied from gradients when + ``gradient_as_bucket_view = true``. 
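In user code the new flag and the detach_() restriction described above look roughly like this sketch, assuming the default process group is already initialized and enough GPUs are visible:

    import torch
    import torch.distributed as dist
    import torch.nn as nn

    rank = dist.get_rank()
    ddp = nn.parallel.DistributedDataParallel(
        nn.Linear(10, 10).cuda(rank),
        device_ids=[rank],
        gradient_as_bucket_view=True,     # prototype flag documented above
    )
    ddp(torch.randn(4, 10).cuda(rank)).sum().backward()
    # Grads are now views into communication buckets, so zero them in place
    # rather than detaching or replacing them:
    for p in ddp.parameters():
        if p.grad is not None:
            p.grad.requires_grad_(False)
            p.grad.zero_()
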
Attributes: module (Module): the module to be parallelized @@ -330,7 +352,8 @@ def __init__(self, module, device_ids=None, process_group=None, bucket_cap_mb=25, find_unused_parameters=False, - check_reduction=False): + check_reduction=False, + gradient_as_bucket_view=False): super(DistributedDataParallel, self).__init__() @@ -381,6 +404,7 @@ def __init__(self, module, device_ids=None, self.require_backward_grad_sync = True self.require_forward_param_sync = True self.ddp_join_enabled = False + self.gradient_as_bucket_view = gradient_as_bucket_view if check_reduction: # This argument is no longer used since the reducer @@ -516,7 +540,8 @@ def produces_sparse_gradient(module): self.process_group, expect_sparse_gradient, self.bucket_bytes_cap, - self.find_unused_parameters) + self.find_unused_parameters, + self.gradient_as_bucket_view) # passing a handle to torch.nn.SyncBatchNorm layer self._passing_sync_batchnorm_handle(self._module_copies) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 85b1d65a06ec..f6f2b9a6fbfb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -2096,6 +2096,14 @@ def _model_step(self, model): param += param.grad param.grad = None + def _model_step_with_zero_grad(self, model): + for param in model.parameters(): + if param.grad is not None: + with torch.no_grad(): + param += param.grad + param.grad.requires_grad_(False) + param.grad.zero_() + def _prepare_dummy_data(self, local_bs): # global_bs for DDP should be divisible by WORLD_SIZE world_size = int(os.environ["WORLD_SIZE"]) @@ -2118,7 +2126,8 @@ def _assert_equal_param(self, param_gpu, param_DDP): self.assertEqual(p_gpu, p_DDP) def _test_DDP_5iter( - self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size, test_save, offset=None, world_size=0 + self, model_base, model_DDP, input, target, loss, local_bs, rank, batch_size, test_save, + offset=None, world_size=0, zero_grad=False ): for idx in range(5): # single cpu/gpu training @@ -2137,8 +2146,12 @@ def _test_DDP_5iter( ) # Update weights and run a second iteration to shake out errors - self._model_step(model_base) - self._model_step(model_DDP) + if zero_grad: + self._model_step_with_zero_grad(model_base) + self._model_step_with_zero_grad(model_DDP) + else: + self._model_step(model_base) + self._model_step(model_DDP) self._assert_equal_param( list(model_base.parameters()), list(model_DDP.module.parameters()) ) @@ -2159,7 +2172,7 @@ def _test_DDP_5iter( for k in model_DDP.state_dict(): self.assertEqual(model_DDP.state_dict()[k], saved_model.state_dict()[k]) - def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): + def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gradient_as_bucket_view=False): # Run a simple end to end DDP model, use result of single node model # as baseline @@ -2174,7 +2187,7 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): model_DDP = copy.deepcopy(model) model_DDP.cuda(gpu_subset[0]) model_DDP = nn.parallel.DistributedDataParallel( - model_DDP, device_ids=gpu_subset + model_DDP, device_ids=gpu_subset, gradient_as_bucket_view=gradient_as_bucket_view ) # test serializable/unserializable @@ -2196,14 +2209,11 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None): local_bs, rank, global_bs, - True + True, ) self._barrier() - @unittest.skipIf( - BACKEND == 
"nccl", "nccl does not support DDP on CPU models" - ) - def test_DistributedDataParallelCPU(self): + def _test_DistributedDataParallelCPU(self, gradient_as_bucket_view=False): # Run a simple end to end DDP-CPU model, use result of single node # model as baseline group, group_id, rank = self._init_global_test() @@ -2213,7 +2223,8 @@ def test_DistributedDataParallelCPU(self): # DDP-CPU training setup model_DDP = copy.deepcopy(model_base) - model_DDP = nn.parallel.DistributedDataParallelCPU(model_DDP) + model_DDP = nn.parallel.DistributedDataParallel( + model_DDP, gradient_as_bucket_view=gradient_as_bucket_view) # dummy data initialization local_bs = 2 @@ -2221,10 +2232,22 @@ def test_DistributedDataParallelCPU(self): # check two model parameters over 5 iterations self._test_DDP_5iter( - model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs, False + model_base, model_DDP, input_cpu, target, loss, local_bs, rank, global_bs, False, zero_grad=True ) self._barrier() + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DDP on CPU models" + ) + def test_DistributedDataParallelCPU(self): + self._test_DistributedDataParallelCPU() + + @unittest.skipIf( + BACKEND == "nccl", "nccl does not support DDP on CPU models" + ) + def test_DistributedDataParallelCPU_grad_is_view(self): + self._test_DistributedDataParallelCPU(gradient_as_bucket_view=True) + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', "Only Nccl & Gloo backend support DistributedDataParallel") def test_DistributedDataParallel_requires_grad(self): @@ -2288,6 +2311,25 @@ def test_DistributedDataParallel(self): gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, output_device=torch.device('cuda')) + @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo', + "Only Nccl & Gloo backend support DistributedDataParallel") + @skip_if_no_gpu + @skip_if_rocm + def test_DistributedDataParallel_with_grad_is_view(self): + group, group_id, rank = self._init_global_test() + rank_to_GPU = self._init_multigpu_helper() + gpus = list(rank_to_GPU[rank]) + self._test_DistributedDataParallel(gpu_subset=gpus, rank=rank, gradient_as_bucket_view=True) + + # test output_device + self._test_DistributedDataParallel( + gpu_subset=gpus, rank=rank, output_device=torch.device('cuda'), gradient_as_bucket_view=True) + + # test device_ids + gpus = list(map(lambda i: torch.device('cuda:' + str(i)), gpus)) + self._test_DistributedDataParallel( + gpu_subset=gpus, rank=rank, output_device=torch.device('cuda'), gradient_as_bucket_view=True) + def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs, global_bs, offset, output_device=None): # Run a simple end to end DDP model, use result of single node model # as baseline From 0122299f9ba729aa0c9bd43764af53225e03672c Mon Sep 17 00:00:00 2001 From: gunandrose4u <52735340+gunandrose4u@users.noreply.github.com> Date: Thu, 24 Sep 2020 21:12:16 -0700 Subject: [PATCH 121/449] Enable distributed package on windows, Gloo backend supported only (#42897) Summary: Fixes https://github.com/pytorch/pytorch/issues/42095 For test case part will be committed to this PR later mrshenli, please help to review Pull Request resolved: https://github.com/pytorch/pytorch/pull/42897 Reviewed By: osalpekar Differential Revision: D23841786 Pulled By: mrshenli fbshipit-source-id: 334ba1ed73eff2f668857390fc32d1bc7f08e5f3 --- .../install_miniconda3.bat | 7 +++ CMakeLists.txt | 8 ++- caffe2/CMakeLists.txt | 49 +++++++++------ 
cmake/Dependencies.cmake | 5 +- test/cpp/dist_autograd/CMakeLists.txt | 2 +- test/distributed/test_c10d.py | 49 ++++++++++----- test/distributed/test_c10d_spawn.py | 8 ++- test/run_test.py | 11 ++-- tools/build_variables.bzl | 7 ++- torch/CMakeLists.txt | 33 +++++----- torch/csrc/Module.cpp | 4 +- torch/csrc/WindowsTorchApiMacro.h | 6 ++ torch/csrc/distributed/c10d/comm.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 10 ++- torch/csrc/distributed/c10d/reducer.cpp | 22 +++---- torch/csrc/distributed/c10d/reducer.h | 14 +++++ torch/csrc/jit/python/pybind_utils.h | 8 +-- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +-- torch/csrc/jit/serialization/pickler.cpp | 6 +- torch/csrc/jit/serialization/unpickler.cpp | 6 +- torch/csrc/utils/future.h | 2 +- torch/distributed/rendezvous.py | 14 ++++- torch/lib/c10d/CMakeLists.txt | 32 ++++++---- torch/lib/c10d/FileStore.cpp | 51 +++++++++++++++- torch/lib/c10d/GlooDeviceFactory.cpp | 33 ++++++---- torch/lib/c10d/ProcessGroupGloo.cpp | 61 ++++++++++++++++--- torch/lib/c10d/Utils.cpp | 3 +- torch/lib/c10d/Utils.hpp | 4 ++ torch/lib/c10d/test/CMakeLists.txt | 15 +++-- torch/lib/c10d/test/CUDATest.hpp | 10 ++- torch/lib/c10d/test/FileStoreTest.cpp | 8 +++ torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 9 ++- torch/lib/c10d/test/TestUtils.hpp | 30 ++++++++- torch/testing/_internal/common_distributed.py | 17 +++++- torch/testing/_internal/common_utils.py | 4 ++ torch/testing/_internal/dist_utils.py | 3 +- .../ddp_under_dist_autograd_test.py | 16 ++--- .../_internal/distributed/distributed_test.py | 48 +++++++++++---- 39 files changed, 462 insertions(+), 167 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index a66ef4b651c5..cf7255ce3789 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,4 +12,11 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake + call conda install -y -q -c rdonnelly libuv ) + +:: Get installed libuv path +@echo off +set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library +@echo on +echo libuv_ROOT=%libuv_ROOT% diff --git a/CMakeLists.txt b/CMakeLists.txt index 826c187b602e..3d937e0e1655 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX) +if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,6 +226,12 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) +# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected +if(WIN32) + set(USE_TENSORPIPE OFF) + message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") +endif() + # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. 
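With these build changes a Windows build exposes the c10d layer but not TensorPipe or RPC. A quick probe of what is expected to be available; the helper names below are the torch.distributed query functions and are listed here as an assumption, not something this patch adds:

    import torch.distributed as dist

    print(dist.is_available())        # True once USE_DISTRIBUTED builds on Windows
    print(dist.is_gloo_available())   # Gloo is the only supported backend there
    print(dist.is_nccl_available())   # expected False on Windows
    print(dist.is_mpi_available())    # expected False on Windows
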
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..219b28c69695 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,26 +291,29 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) + if(NOT WIN32) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) + + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) + endif() endif() endif() @@ -493,7 +496,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -837,7 +840,7 @@ endif() if(BUILD_TEST AND NOT USE_ROCM) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -889,9 +892,7 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - if(NOT MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) - endif() + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) endif() @@ -966,6 +967,14 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) + # Pass USE_RPC in order to reduce use of + # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) + # need to be removed when RPC is 
supported + if(NOT WIN32) + target_compile_definitions(torch_cpu PRIVATE + USE_RPC + ) + endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. if(USE_TENSORPIPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 028098f61d36..023bbe9e8d07 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,10 +1253,7 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(MSVC) - message(WARNING "Gloo can not be used on Windows.") - caffe2_update_option(USE_GLOO OFF) - elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 5d23602881f0..9969c63e16d5 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED) +if(USE_DISTRIBUTED AND NOT WIN32) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index a81bc53f175a..911a73ce432e 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests + simple_sparse_reduce_tests, skip_if_win32, create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,6 +255,7 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) +@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -273,6 +274,7 @@ def test_address_already_in_use(self): store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 +@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -329,6 +331,7 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') +@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures def test_common_errors(self): @@ -455,7 +458,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = 'file://%s?world_size=%d' % (file.name, 2) + url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -474,6 +477,7 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) +@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -544,9 +548,13 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + if sys.platform == 'win32': + yield "file:///%s" % f.name.replace("\\", "/") + f.close() + else: + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in 
self._init_methods(): @@ -584,11 +592,16 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - self._fork_processes() + + # For Windows platform, Python does not support fork, change it to spawn here. + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + opts.devices = [create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -598,8 +611,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1514,6 +1527,7 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1531,6 +1545,7 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) + @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1959,7 +1974,10 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -2068,7 +2086,7 @@ def update_parameters(model): def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @@ -3947,7 +3965,10 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -4013,7 +4034,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -4024,7 +4045,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = 
c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index d0bf00b8a08a..c84608e8f178 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,8 +10,10 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo -from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo, \ + create_device +from torch.testing._internal.common_utils import TestCase, load_tests, \ + run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -39,7 +41,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] + opts.devices = [create_device(interface='lo')] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/run_test.py b/test/run_test.py index d63fc372f9c2..0f9d14a78605 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA import torch.distributed as dist from typing import Dict, Optional @@ -99,7 +99,6 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -306,9 +305,13 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): + if sys.platform == 'win32' and backend != 'gloo': + continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: + if sys.platform == 'win32' and not with_init_file: + continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -322,9 +325,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = 'file://{}/'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/' else: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 174bb858da44..c21fab8ec2cf 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -537,11 +537,14 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_sources = [ - "torch/csrc/distributed/autograd/init.cpp", +libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", 
"torch/csrc/distributed/c10d/reducer.cpp", +] + +libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b78dc4a362a7..2ae2f7f737fe 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -160,25 +160,28 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(NOT MSVC) + if(WIN32) + append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) + else() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() endif() + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL) +if(USE_NCCL AND NOT WIN32) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ed4aa21a8f76..ae6f15155f2a 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -688,9 +688,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_DISTRIBUTED -#ifdef USE_C10D +#if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); +#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); diff --git a/torch/csrc/WindowsTorchApiMacro.h 
b/torch/csrc/WindowsTorchApiMacro.h index 7f8ef4e01677..7f44db0baba9 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,3 +5,9 @@ // There's no difference between aten, torch and caffe2 libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API + +#ifdef _WIN32 +#define TORCH_PYTHON_API +#else +#define TORCH_PYTHON_API CAFFE2_API +#endif diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index e2b501f08aff..2eb626c40232 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_API CommHookInterface { +struct TORCH_PYTHON_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_API PythonCommHook : public CommHookInterface { +class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 165d6a1c8603..be1752d7366f 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,7 +1,11 @@ #include #include +#ifndef _WIN32 #include +#include +#include +#endif #include #ifdef USE_C10D_GLOO @@ -17,8 +21,6 @@ #endif #include -#include -#include #include #include @@ -323,6 +325,7 @@ They are used in specifying strategies for reduction collectives, e.g., shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) .def(py::init()); +#ifndef _WIN32 shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) .def(py::init<>()); @@ -340,6 +343,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); +#endif shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) .def(py::init>()); @@ -607,6 +611,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); +#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -620,6 +625,7 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); +#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 86916c7994dd..814d3494ff4e 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -89,10 +89,7 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); // The gradient accumulator function is lazily initialized once. 
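Since the HashStore and TCPStore bindings are compiled out on Windows above, FileStore is the store that remains available everywhere. A sketch of wiring it straight into the process group; the path and world size are placeholders, and passing a prebuilt store to init_process_group is assumed to be supported by the Python API of this era:

    import torch.distributed as dist

    store = dist.FileStore("/tmp/ddp_store", 2)   # on Windows e.g. r"C:\tmp\ddp_store"
    dist.init_process_group("gloo", store=store, rank=0, world_size=2)
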
// Therefore we can use its presence in the autograd graph as @@ -100,15 +97,19 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); +#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; +#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { +#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); +#endif this->autograd_hook(index); return outputs; })), @@ -477,10 +478,7 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); push_rebuilt_params(index); } } @@ -850,10 +848,8 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator{ - .bucket_index = bucket_index, - .intra_bucket_index = intra_bucket_index++, - }; + variable_locators_[variable_index] = VariableLocator( + bucket_index, intra_bucket_index++); } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -1235,7 +1231,9 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd +#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); +#endif } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 960a32356acf..486b7337366a 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -104,6 +104,13 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; + + VariableIndex() = default; + + VariableIndex(size_t replica_index_, size_t variable_index_) { + replica_index = replica_index_; + variable_index = variable_index_; + } }; void push_rebuilt_params(const VariableIndex& index); @@ -281,6 +288,13 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; + + VariableLocator() = default; + + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; + } }; // Map the index of a variable to its location in the bucket structure. 
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 65f5a49145c8..4be55a9caa90 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -320,7 +320,7 @@ inline InferredType tryToInferType(py::handle input) { if (py::isinstance(input)) { auto object = py::cast(input); return InferredType(object.type()); -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC } else if (py::isinstance(input)) { auto rref_ivalue = input.cast().toIValue(); return InferredType(rref_ivalue.type()); @@ -716,7 +716,7 @@ inline IValue toIValue( } } case TypeKind::RRefType: { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return obj.cast().toIValue(); #else AT_ERROR("RRef is only supported with the distributed package"); @@ -896,7 +896,7 @@ inline py::object toPyObject(IValue ivalue) { } return std::move(py_dict); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC auto RRefPtr = c10::dynamic_intrusive_pointer_cast( std::move(ivalue).toRRef()); @@ -942,7 +942,7 @@ inline py::object toPyObject(IValue ivalue) { auto py_class = getScriptedClassOrError(qualified_class_name); return py_class.attr(enum_holder->name().c_str()); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return py::cast(torch::distributed::rpc::PyRRef( c10::static_intrusive_pointer_cast( ivalue.toRRef()))); diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index ba94d33f37b3..119b6b5e5de7 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -916,7 +916,7 @@ std::shared_ptr toSugaredValue( } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { return SpecialFormValue::create(prim::annotate); -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC // RPC module is only avaialble when build flag "USE_DISTRIBUTED" is on. 
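Because the RRef/RPC pieces stay behind USE_RPC and are absent from Windows builds, a script that mixes collectives with RPC can guard the import by platform. This is only a usage sketch, not something the patch itself requires:

    import sys
    import torch.distributed as dist       # c10d collectives: available on Windows

    if sys.platform != "win32":
        import torch.distributed.rpc as rpc   # RPC/RRef: Linux/macOS only here
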
} else if ( obj.ptr() == diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 337fe66c0789..f61e2597447f 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include using torch::distributed::autograd::DistAutogradContainer; #endif @@ -267,7 +267,7 @@ void insertLastUses(Graph& g) { } inline int64_t getDistAutogradContextId() { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return DistAutogradContainer::currentContextId(); #else return 0; @@ -1690,7 +1690,7 @@ InterpreterState::InterpreterState( : pImpl(std::move(pImpl_)) {} void InterpreterContinuation::operator()() { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC auto prev_dist_id = DistAutogradContainer::currentContextId(); DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); #endif @@ -1700,7 +1700,7 @@ void InterpreterContinuation::operator()() { } else { state.runAsync(stack); } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC DistAutogradContainer::forceCurrentContextId(prev_dist_id); #endif } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6f911f4246cc..2bc9abea8c57 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include #endif #include @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { "this class."; AT_ERROR(err.str()); } else if (ivalue.isRRef()) { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC TORCH_CHECK( torch::distributed::rpc::getAllowJitRRefPickle() == true, "RRef jit pickling is only allowed inside RPC calls."); @@ -166,7 +166,7 @@ void Pickler::pushDevice(const IValue& ivalue) { } } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC void Pickler::pushRRef(const IValue& ivalue) { // It is the same as how rref is pickled in python, see PyRRef::pickle auto rrefInterface = ivalue.toRRef(); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index c416f9641023..9b8fce0b4869 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC #include #endif #include @@ -549,7 +549,7 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); return; } else if (module_name == "torch.distributed.rpc" && class_name == "rref") { -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC return rebuildRRef(); #else TORCH_INTERNAL_ASSERT( @@ -669,7 +669,7 @@ void Unpickler::rebuildTensor(bool quantized) { }); } -#ifdef USE_DISTRIBUTED +#ifdef USE_RPC void Unpickler::rebuildRRef() { globals_.emplace_back([this] { // It is the same as how rref is unpickled in python, diff --git a/torch/csrc/utils/future.h b/torch/csrc/utils/future.h index 6d672ee86cd5..093d043ecf7d 100644 --- a/torch/csrc/utils/future.h +++ b/torch/csrc/utils/future.h @@ -26,7 +26,7 @@ class TORCH_API FutureError final : public std::exception { // Most implementation is copied from FutureMessage and // c10::ivalue::Future template -class TORCH_API Future final { +class TORCH_PYTHON_API Future final { public: Future() = default; diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 292634580aab..4545aea2bf56 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -6,9 +6,12 @@ import torch._six as six 
import numbers import os -from . import FileStore, TCPStore +import sys +from . import FileStore from .constants import default_pg_timeout +if sys.platform != 'win32': + from . import TCPStore _rendezvous_handlers = {} @@ -90,6 +93,10 @@ def _error(msg): result = urlparse(url) path = result.path + if sys.platform == 'win32': + import urllib.request + path = urllib.request.url2pathname(result.path) + if not path: raise _error("path missing") query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) @@ -175,7 +182,8 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") +if sys.platform != 'win32': + register_rendezvous_handler("tcp", _tcp_rendezvous_handler) + register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) -register_rendezvous_handler("tcp", _tcp_rendezvous_handler) -register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 68fe49f411f5..4b206f380111 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,15 +45,16 @@ endfunction() set(C10D_SRCS FileStore.cpp - HashStore.cpp ProcessGroup.cpp - ProcessGroupRoundRobin.cpp Store.cpp PrefixStore.cpp - TCPStore.cpp Utils.cpp ) +if(NOT WIN32) + list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) +endif() + set(C10D_LIBS torch) if(USE_C10D_NCCL) @@ -77,14 +78,17 @@ endif() add_library(c10d STATIC ${C10D_SRCS}) set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET c10d PROPERTY CXX_STANDARD 14) -target_compile_options(c10d PUBLIC - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - ) + +if(NOT MSVC) + target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) +endif() add_dependencies(c10d torch) @@ -118,17 +122,19 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) -copy_header(HashStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) -copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) copy_header(ProcessGroupGloo.hpp) copy_header(GlooDeviceFactory.hpp) endif() +if(NOT WIN32) + copy_header(HashStore.hpp) + copy_header(TCPStore.hpp) +endif() if(USE_C10D_NCCL) copy_header(ProcessGroupNCCL.hpp) diff --git a/torch/lib/c10d/FileStore.cpp b/torch/lib/c10d/FileStore.cpp index 55346e0fa635..eb25c52f787a 100644 --- a/torch/lib/c10d/FileStore.cpp +++ b/torch/lib/c10d/FileStore.cpp @@ -3,9 +3,16 @@ #include #include #include -#include #include + +#ifdef _WIN32 +#include +#include +#include +#else +#include #include +#endif #include #include @@ -21,6 +28,40 @@ throw std::system_error(errno, std::system_category(), ##__VA_ARGS__); \ } +#ifdef _WIN32 +#define LOCK_EX 0x00000001 +#define LOCK_SH 0x00000010 +#define LOCK_UN 0x00000100 + +int flock_(int fd, int op) { + HANDLE hdl = (HANDLE) _get_osfhandle(fd); + DWORD low = 1, high = 0; + OVERLAPPED offset = {0, 0, 0, 0, NULL}; + + if (hdl < 0) + return -1; + + switch (op) { + case LOCK_EX: + if (LockFileEx(hdl, LOCKFILE_EXCLUSIVE_LOCK, 0, low, high, &offset)) + return 0; + break; + case LOCK_SH: + if (LockFileEx(hdl, 0, 0, low, high, &offset)) + return 0; + break; + case LOCK_UN: + 
if(UnlockFileEx(hdl, 0, low, high, &offset) != 0) + return 0; + break; + default: + break; + } + errno = EINVAL; + return -1; +} +#endif + namespace c10d { namespace { @@ -79,7 +120,11 @@ class Lock { int fd_{-1}; void flock(int operation) { +#ifdef _WIN32 + auto rv = syscall(std::bind(::flock_, fd_, operation)); +#else auto rv = syscall(std::bind(::flock, fd_, operation)); +#endif SYSASSERT(rv, "flock"); } }; @@ -92,7 +137,11 @@ class File { std::chrono::milliseconds timeout) { const auto start = std::chrono::steady_clock::now(); while (true) { +#ifdef _WIN32 + fd_ = syscall(std::bind(::open, path.c_str(), flags | _O_BINARY, _S_IREAD | _S_IWRITE)); +#else fd_ = syscall(std::bind(::open, path.c_str(), flags, 0644)); +#endif // Only retry when the file doesn't exist, since we are waiting for the // file to be created in this case to address the following issue: // https://github.com/pytorch/pytorch/issues/13750 diff --git a/torch/lib/c10d/GlooDeviceFactory.cpp b/torch/lib/c10d/GlooDeviceFactory.cpp index 70c3c2bb7a31..dca6b03eb9dd 100644 --- a/torch/lib/c10d/GlooDeviceFactory.cpp +++ b/torch/lib/c10d/GlooDeviceFactory.cpp @@ -36,16 +36,16 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( #if GLOO_HAVE_TRANSPORT_TCP static std::shared_ptr<::gloo::transport::Device> makeTCPDevice( - const std::string& interface, + const std::string& interfaceName, const std::string& hostname) { TORCH_CHECK( - !interface.empty() || !hostname.empty(), + !interfaceName.empty() || !hostname.empty(), "GlooDeviceFactory::makeTCPDevice(): interface or hostname " "can't be empty"); ::gloo::transport::tcp::attr attr; - if (!interface.empty()) { - attr.iface = interface; + if (!interfaceName.empty()) { + attr.iface = interfaceName; } else { attr.hostname = hostname; } @@ -61,16 +61,16 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice); #if GLOO_HAVE_TRANSPORT_UV static std::shared_ptr<::gloo::transport::Device> makeUVDevice( - const std::string& interface, + const std::string& interfaceName, const std::string& hostname) { TORCH_CHECK( - !interface.empty() || !hostname.empty(), + !interfaceName.empty() || !hostname.empty(), "GlooDeviceFactory::makeUVDevice(): interface or hostname " "can't be empty"); ::gloo::transport::uv::attr attr; - if (!interface.empty()) { - attr.iface = interface; + if (!interfaceName.empty()) { + attr.iface = interfaceName; } else { attr.hostname = hostname; } @@ -81,23 +81,28 @@ static std::shared_ptr<::gloo::transport::Device> makeUVDevice( // the flexibility of other application to override by priority. Register // UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override. 
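Putting the FileStore and rendezvous changes together, the initialization path that works on Windows is a file:// URL with forward slashes, mirroring the earlier test changes. A single-process sketch:

    import os
    import tempfile
    import torch.distributed as dist

    f = tempfile.NamedTemporaryFile(delete=False)
    f.close()
    init_method = f'file:///{f.name.replace(os.path.sep, "/")}'
    dist.init_process_group("gloo", init_method=init_method,
                            rank=0, world_size=1)
    dist.destroy_process_group()
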
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice); +C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice); C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice); #endif static const char* glooDeviceTransport = getenv("GLOO_DEVICE_TRANSPORT"); std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: - makeDeviceForInterface(const std::string& interface) { + makeDeviceForInterface(const std::string& interfaceName) { if (glooDeviceTransport) { - return GlooDeviceRegistry()->Create(glooDeviceTransport, interface, ""); + return GlooDeviceRegistry()->Create(glooDeviceTransport, interfaceName, ""); } #ifdef __linux__ - return GlooDeviceRegistry()->Create("LINUX", interface, ""); + return GlooDeviceRegistry()->Create("LINUX", interfaceName, ""); #endif #ifdef __APPLE__ - return GlooDeviceRegistry()->Create("APPLE", interface, ""); + return GlooDeviceRegistry()->Create("APPLE", interfaceName, ""); +#endif + +#ifdef _WIN32 + return GlooDeviceRegistry()->Create("WIN32", interfaceName, ""); #endif throw std::runtime_error("makeDeviceForInterface(): unsupported gloo device"); @@ -117,6 +122,10 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: return GlooDeviceRegistry()->Create("APPLE", "", hostname); #endif +#ifdef _WIN32 + return GlooDeviceRegistry()->Create("WIN32", "", hostname); +#endif + throw std::runtime_error("makeDeviceForHostname(): unsupported gloo device"); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 531fe751f1c9..c139ac7a34fd 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,10 +2,16 @@ #include +#ifdef _WIN32 +#include +#include +#include +#else #include #include -#include #include +#endif +#include #include @@ -36,6 +42,36 @@ #include #include +#ifdef _WIN32 +#define GENERATE_ALL_TYPES(type, func, ...) \ + switch (type) { \ + case ::at::ScalarType::Float: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Double: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Half: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Char: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Byte: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Int: \ + func(__VA_ARGS__); \ + break; \ + case ::at::ScalarType::Long: \ + func(__VA_ARGS__); \ + break; \ + default: \ + throw std::runtime_error("Invalid scalar type"); \ + } + +#define HOST_NAME_MAX 256 +#else #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ case ::at::ScalarType::Float: \ @@ -62,6 +98,7 @@ default: \ throw std::runtime_error("Invalid scalar type"); \ } +#endif namespace c10d { @@ -409,12 +446,19 @@ ProcessGroupGloo::Options::Options() namespace { +void socketInitialize() { +#ifdef _WIN32 + ::gloo::init_winsock(); +#endif +} + // Gloo assumes that this machine's hostname can always be resolved // to an address. If it doesn't it throws a runtime error saying // that it can't be resolved. Instead of catching it, we choose // to proactively check if an address can be resolved, so we can // gracefully fall back to an alternative if it doesn't. 
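// socketInitialize() is a no-op on POSIX platforms; on Windows it calls
// ::gloo::init_winsock() so that the getaddrinfo/socket/bind probe below can
// run before any process group has initialized Winsock.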
bool doesHostnameResolveToUsableAddress(const std::string& hostname) { + socketInitialize(); struct addrinfo hints; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -431,7 +475,11 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { continue; } rv = bind(fd, rp->ai_addr, rp->ai_addrlen); +#ifdef _WIN32 + closesocket(fd); +#else close(fd); +#endif if (rv == -1) { continue; } @@ -443,14 +491,11 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { } // namespace -#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: - createDeviceForInterface(const std::string& interface) { - return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface); + createDeviceForInterface(const std::string& interface_name) { + return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name); } -#endif -#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDeviceForHostname(const std::string& hostname) { TORCH_CHECK( @@ -460,14 +505,14 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: " to a (local) address"); return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname); } -#endif -#ifdef __linux__ +#if defined(__linux__) || defined(_WIN32) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDefaultDevice() { // Use the hostname to resolve the network address to // use. Note: if the hostname does not resolve to an address (e.g. // because of misconfigured /etc/hosts file), this will not work. + socketInitialize(); std::array hostname{}; auto rv = gethostname(hostname.data(), HOST_NAME_MAX); if (rv != 0) { diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index d975f6eb6bc5..6c6e941ef95d 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,5 +1,6 @@ #include +#ifndef _WIN32 #include #include @@ -354,6 +355,6 @@ std::tuple accept( return std::make_tuple( socket, sockaddrToString(reinterpret_cast(&addr))); } - } // namespace tcputil } // namespace c10d +#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 1bdaddde9f24..1116cd39ba1c 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,6 +1,8 @@ #pragma once +#ifndef _WIN32 #include +#endif #include #include @@ -480,6 +482,7 @@ class ResourceGuard { bool released_; }; +#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); @@ -609,4 +612,5 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil +#endif } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 8429d1099b29..003f56f30861 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -8,14 +8,19 @@ function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
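  # pthread linkage and -Wno-error assume a POSIX toolchain, so they are only
  # applied inside the if(NOT WIN32) branch; MSVC builds link just the
  # libraries passed in via ${ARGN}.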
- target_link_libraries(${test_name} pthread ${ARGN}) - target_compile_options(${test_name} PRIVATE -Wno-error) + target_link_libraries(${test_name} ${ARGN}) + if(NOT WIN32) + target_link_libraries(${test_name} pthread) + target_compile_options(${test_name} PRIVATE -Wno-error) + endif() add_test(NAME ${test_name} COMMAND $) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) -c10d_add_test(HashStoreTest.cpp c10d gtest_main) -c10d_add_test(TCPStoreTest.cpp c10d gtest_main) +if(NOT WIN32) + c10d_add_test(HashStoreTest.cpp c10d gtest_main) + c10d_add_test(TCPStoreTest.cpp c10d gtest_main) +endif() if(USE_CUDA) if(USE_C10D_GLOO) @@ -29,7 +34,7 @@ if(USE_CUDA) endif() else() if(USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main) endif() endif() diff --git a/torch/lib/c10d/test/CUDATest.hpp b/torch/lib/c10d/test/CUDATest.hpp index defaff895a18..328da2faf648 100644 --- a/torch/lib/c10d/test/CUDATest.hpp +++ b/torch/lib/c10d/test/CUDATest.hpp @@ -5,9 +5,15 @@ namespace c10d { namespace test { -void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); +#ifdef _WIN32 +#define EXPORT_TEST_API __declspec(dllexport) +#else +#define EXPORT_TEST_API +#endif -int cudaNumDevices(); +EXPORT_TEST_API void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); + +EXPORT_TEST_API int cudaNumDevices(); } // namespace test } // namespace c10d diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index 77215f4521c2..cc8da6326091 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -1,6 +1,8 @@ #include +#ifndef _WIN32 #include +#endif #include #include @@ -10,6 +12,11 @@ #include #include +#ifdef _WIN32 +std::string tmppath() { + return c10d::test::autoGenerateTmpFilePath(); +} +#else std::string tmppath() { const char* tmpdir = getenv("TMPDIR"); if (tmpdir == nullptr) { @@ -29,6 +36,7 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } +#endif void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index 6606e553e733..da4f9b5fc106 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -1,7 +1,10 @@ +#ifndef _WIN32 #include -#include #include #include +#endif + +#include #include #include @@ -21,6 +24,7 @@ using namespace c10d::test; constexpr auto kSendDelay = std::chrono::milliseconds(100); constexpr auto kWaitTimeout = std::chrono::milliseconds(1); +#ifndef _WIN32 class SignalTest { public: SignalTest(const std::string& path) : path_(path) {} @@ -92,6 +96,7 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( test.arm(fork.pid, signal); return test.run(0, 2); } +#endif class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: @@ -456,6 +461,7 @@ void testRecv(const std::string& path) { EXPECT_TRUE(recvCompleted); } +#ifndef _WIN32 TEST(ProcessGroupGlooTest, testSIGSTOPException) { // test SIGSTOP // Fork() and TSAN don't play well together, so skip the test if we're testing @@ -485,6 +491,7 @@ TEST(ProcessGroupGlooTest, testSIGKILLException) { EXPECT_FALSE(work->isSuccess()); EXPECT_THROW(std::rethrow_exception(work->exception()), std::exception); } +#endif TEST(ProcessGroupGlooTest, testAllReduceCPU) { { diff --git a/torch/lib/c10d/test/TestUtils.hpp 
b/torch/lib/c10d/test/TestUtils.hpp index c62695485573..5f5dfca315cb 100644 --- a/torch/lib/c10d/test/TestUtils.hpp +++ b/torch/lib/c10d/test/TestUtils.hpp @@ -1,9 +1,12 @@ #pragma once +#ifndef _WIN32 #include -#include #include #include +#endif + +#include #include #include @@ -37,6 +40,28 @@ class Semaphore { std::condition_variable cv_; }; +#ifdef _WIN32 +std::string autoGenerateTmpFilePath() { + char tmp[L_tmpnam_s]; + errno_t err; + err = tmpnam_s(tmp, L_tmpnam_s); + if (err != 0) + { + throw std::system_error(errno, std::system_category()); + } + return std::string(tmp); +} + +std::string tmppath() { + const char* tmpfile = getenv("TMPFILE"); + if (tmpfile) { + return std::string(tmpfile); + } + else { + return autoGenerateTmpFilePath(); + } +} +#else std::string tmppath() { // TMPFILE is for manual test execution during which the user will specify // the full temp file path using the environmental variable TMPFILE @@ -63,6 +88,7 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } +#endif bool isTSANEnabled() { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); @@ -80,6 +106,7 @@ struct TemporaryFile { } }; +#ifndef _WIN32 struct Fork { pid_t pid; @@ -101,6 +128,7 @@ struct Fork { return pid == 0; } }; +#endif } // namespace test } // namespace c10d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index f8e5b4822bd8..b2cd30c66812 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -16,7 +16,7 @@ import torch.distributed as c10d from functools import partial, reduce -from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM +from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, FILE_SCHEMA class TestSkip(NamedTuple): exit_code: int @@ -143,10 +143,23 @@ def wrapper(*args, **kwargs): return wrapper +def skip_if_win32(): + return unittest.skipIf( + sys.platform == 'win32', + "This unit test case is not supportted on Windows platform", + ) + TIMEOUT_DEFAULT = 100 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} +def create_device(interface=None): + if sys.platform == 'win32' or interface is None: + return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1") + else: + return c10d.ProcessGroupGloo.create_device(interface=interface) + + def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) @@ -206,7 +219,7 @@ def initialize_temp_directories(init_method=None): if init_method is not None: os.environ["INIT_METHOD"] = init_method else: - os.environ["INIT_METHOD"] = "file://" + os.path.join( + os.environ["INIT_METHOD"] = FILE_SCHEMA + os.path.join( init_dir_path, "shared_init_file" ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 9959551031ff..36434ff8aa2f 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -53,6 +53,10 @@ torch.backends.disable_global_flags() +FILE_SCHEMA = "file://" +if sys.platform == 'win32': + FILE_SCHEMA = "file:///" + IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' class ProfilingMode(Enum): diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index b88765211df1..93de304a53ca 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -7,6 +7,7 @@ import torch.distributed as dist import torch.distributed.rpc as rpc from 
torch.distributed.rpc import _rref_context_get_debug_info # type: ignore[attr-defined] +from torch.testing._internal.common_utils import FILE_SCHEMA if not dist.is_available(): @@ -14,7 +15,7 @@ sys.exit(0) -INIT_METHOD_TEMPLATE = "file://{file_name}" +INIT_METHOD_TEMPLATE = FILE_SCHEMA + "{file_name}" def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True, diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 1b1f755ed4cc..09db831e9999 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init +from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -329,7 +329,7 @@ def _remote_worker_process(self): gLogger.info("The remote worker is running.") dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -346,7 +346,7 @@ def _trainer_process(self, rank: int): ) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -363,7 +363,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool): gLogger.info("Running the master process...") dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -500,7 +500,7 @@ def _run_test_ddp_comparision(self, simulate_uneven_inputs=False): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -567,7 +567,7 @@ def test_ddp_dist_autograd_sparse_grads(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -604,7 +604,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -651,7 +651,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method="file://{}".format(self.file_name), + init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), world_size=self.world_size, rank=self.rank, ) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index f6f2b9a6fbfb..af5e648f6acb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,5 +1,4 @@ import copy -import fcntl import itertools import random import math @@ -22,6 
+21,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.distributed.distributed_c10d import _get_default_group, AllreduceOptions, GroupMember +from torch.testing._internal.common_utils import FILE_SCHEMA from torch.testing._internal.common_distributed import ( MultiProcessTestCase, TEST_SKIPS, @@ -43,6 +43,10 @@ except ImportError: HAS_TORCHVISION = False +if sys.platform == 'win32': + import msvcrt +else: + import fcntl class Foo: def __init__(self, x): @@ -191,10 +195,17 @@ def _lock(): lockfile = os.path.join(TEMP_DIR, "lockfile") with open(lockfile, "w") as lf: try: - fcntl.flock(lf.fileno(), fcntl.LOCK_EX) - yield + if sys.platform == 'win32': + msvcrt.locking(lf.fileno(), msvcrt.LK_RLCK, 1) + yield + else: + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield finally: - fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + if sys.platform == 'win32': + msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) + else: + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close() @@ -270,7 +281,7 @@ def tearDown(self): @property def init_method(self): - return "file://{file_name}".format(file_name=self.file_name) + return "{}{file_name}".format(FILE_SCHEMA, file_name=self.file_name) @classmethod def _run(cls, rank, test_name, file_name): @@ -2162,8 +2173,13 @@ def _test_DDP_5iter( # save the model in the middle and reload if test_save and idx == 2 and INIT_METHOD.startswith("file://"): with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) with tempfile.TemporaryFile() as tmp_file: torch.save(model_DDP, tmp_file) @@ -2192,8 +2208,13 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gr # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # dummy data initialization local_bs = len(gpu_subset) @@ -2350,8 +2371,13 @@ def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + if sys.platform == 'win32': + torch.save(model_DDP, tmp) + tmp.seek(0) + model_DDP = torch.load(tmp) + else: + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # data initialization input_cpu = torch.randn(global_bs, 2) From 31ae8117baec653f6d7688d33dbabc31be5378e1 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Thu, 24 Sep 2020 22:00:37 -0700 Subject: [PATCH 122/449] [RFC] Remove per-op-registration related code in caffe2/tools/codegen/gen.py (#45134) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45134 Per-Op-Registration was a mechanism used for mobile selective build v0. Since then, a new dispathing mechanism has been built for PyTorch, and this code path isn't used any more. Remove it to simplify understanding/updating the code-generator's code-flow. ghstack-source-id: 112723942 Test Plan: `buck build` and sandcastle. 
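(A minimal sketch of the grouping the removed per-op path performed, reusing
the same f.func.name.name access as the deleted gen.py code; the helper name
is hypothetical.)

    from collections import defaultdict

    def group_by_unoverloaded_name(native_functions):
        # "aten::add.Tensor" and "aten::add.out" both land under "aten::add"
        grouped = defaultdict(list)
        for f in native_functions:
            grouped[f"aten::{f.func.name.name}"].append(f)
        return grouped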
Reviewed By: ezyang Differential Revision: D23806632 fbshipit-source-id: d93cd324650c541d9bfc8eeff2ddb2833b988ecc --- aten/src/ATen/templates/PerOpRegistration.cpp | 15 ------ tools/codegen/gen.py | 53 ++----------------- 2 files changed, 4 insertions(+), 64 deletions(-) delete mode 100644 aten/src/ATen/templates/PerOpRegistration.cpp diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp deleted file mode 100644 index 72ac3d784dad..000000000000 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// ${generated_comment} - -#include -#include -#include -#include -$extra_headers - -namespace at { - -TORCH_LIBRARY_FRAGMENT_THIS_API_IS_FOR_PER_OP_REGISTRATION_ONLY(aten, m) { - ${function_registrations} -} - -} // namespace at diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index be8c57f1061a..83d9fa04cf37 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -2,7 +2,7 @@ import contextlib import textwrap import itertools -from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, Union, Sequence import yaml from enum import Enum from collections import OrderedDict @@ -914,11 +914,6 @@ def main() -> None: nargs='*', help='filter dispatch backend by the whitelist (if set), ' 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') parser.add_argument( '--force_schema_registration', action='store_true', @@ -1011,8 +1006,7 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method( dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions - )) if not options.per_op_registration else [], + native_functions)), }) del fm @@ -1037,11 +1031,11 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), 'math_function_registrations': list(mapMaybe( compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), @@ -1080,45 +1074,6 @@ def computeSchemaRegister() -> Dict[str, object]: } cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) - if options.per_op_registration: - def gen_per_op_registration_filename(opname: str) -> str: - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - - # First, group all native functions by unoverloaded operator name - grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) - for f in native_functions: - grouped_functions[f"aten::{f.func.name.name}"].append(f) - extra_headers = [] - for b in backends: - extra_headers.append(f'#include ') - - # Next, generate registration 
for each one - for name in op_registration_whitelist: - def computePerOpRegistration() -> Dict[str, object]: - fs = grouped_functions[name] - registrations: List[str] = [] - for mb_dispatch in itertools.chain([None], backends): - # or you could pass in op_registration_whitelist, it doesn't - # matter! - # NB: Use of compute_type_method here is kind of an abuse; - # this is why we have to unconditionally write in - # torch::dispatch in the registration when it should be - # contextually clear - registrations.extend( - mapMaybe( - compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), - fs)) - return { - 'extra_headers': extra_headers, - 'function_registrations': registrations, - } - - cpu_fm.write_with_template( - gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) - cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) if options.output_dependencies: From bc3151dee0b73e10c64788fce2d822e96aeffb4a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 24 Sep 2020 22:10:52 -0700 Subject: [PATCH 123/449] [quant] Remove unused qconfig argument in qat linear module (#45307) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45307 fixes: https://github.com/pytorch/pytorch/issues/35634 Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D23917339 fbshipit-source-id: 65f8844b98198bbf93547b3d71408c2a54605218 --- torch/nn/intrinsic/qat/modules/conv_fused.py | 17 ++++++++--------- torch/nn/intrinsic/qat/modules/linear_relu.py | 4 ++-- torch/nn/qat/modules/conv.py | 7 +++---- torch/nn/qat/modules/linear.py | 7 +++---- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/torch/nn/intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py index db46bb5ac2ee..5a8b0f042db1 100644 --- a/torch/nn/intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -162,7 +162,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, miss state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -170,10 +170,9 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, 'qat.' 
+ cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' - qconfig = mod.qconfig + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' + qconfig = mod.qconfig conv, bn = mod[0], mod[1] qat_convbn = cls(conv.in_channels, conv.out_channels, conv.kernel_size, conv.stride, conv.padding, conv.dilation, @@ -278,8 +277,8 @@ def forward(self, input): return F.relu(ConvBn2d._forward(self, input)) @classmethod - def from_float(cls, mod, qconfig=None): - return super(ConvBnReLU2d, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(ConvBnReLU2d, cls).from_float(mod) class ConvReLU2d(nnqat.Conv2d): r""" @@ -313,8 +312,8 @@ def forward(self, input): self._conv_forward(input, self.weight_fake_quant(self.weight))) @classmethod - def from_float(cls, mod, qconfig=None): - return super(ConvReLU2d, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(ConvReLU2d, cls).from_float(mod) def update_bn_stats(mod): if type(mod) in set([ConvBnReLU2d, ConvBn2d]): diff --git a/torch/nn/intrinsic/qat/modules/linear_relu.py b/torch/nn/intrinsic/qat/modules/linear_relu.py index 03f556c4ac2e..b11072ddb7be 100644 --- a/torch/nn/intrinsic/qat/modules/linear_relu.py +++ b/torch/nn/intrinsic/qat/modules/linear_relu.py @@ -34,5 +34,5 @@ def forward(self, input): return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias)) @classmethod - def from_float(cls, mod, qconfig=None): - return super(LinearReLU, cls).from_float(mod, qconfig) + def from_float(cls, mod): + return super(LinearReLU, cls).from_float(mod) diff --git a/torch/nn/qat/modules/conv.py b/torch/nn/qat/modules/conv.py index 63fb4b0fa1fd..7daeecddd4e1 100644 --- a/torch/nn/qat/modules/conv.py +++ b/torch/nn/qat/modules/conv.py @@ -32,7 +32,7 @@ def forward(self, input): return self._conv_forward(input, self.weight_fake_quant(self.weight)) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -40,9 +40,8 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, 'qat.' + cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' if type(mod) == ConvReLU2d: mod = mod[0] qconfig = mod.qconfig diff --git a/torch/nn/qat/modules/linear.py b/torch/nn/qat/modules/linear.py index 77998426239f..47fc40b9b6c0 100644 --- a/torch/nn/qat/modules/linear.py +++ b/torch/nn/qat/modules/linear.py @@ -30,7 +30,7 @@ def forward(self, input): return F.linear(input, self.weight_fake_quant(self.weight), self.bias) @classmethod - def from_float(cls, mod, qconfig=None): + def from_float(cls, mod): r"""Create a qat module from a float module or qparams_dict Args: `mod` a float module, either produced by torch.quantization utilities @@ -38,9 +38,8 @@ def from_float(cls, mod, qconfig=None): """ assert type(mod) == cls._FLOAT_MODULE, ' qat.' 
+ cls.__name__ + '.from_float only works for ' + \ cls._FLOAT_MODULE.__name__ - if not qconfig: - assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' - assert mod.qconfig, 'Input float module must have a valid qconfig' + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid qconfig' if type(mod) == LinearReLU: mod = mod[0] From 103fa3894a0dff4bd697688a4a5d6095cd45162e Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Sep 2020 22:42:46 -0700 Subject: [PATCH 124/449] Revert D23841786: [pytorch][PR] Enable distributed package on windows, Gloo backend supported only Test Plan: revert-hammer Differential Revision: D23841786 (https://github.com/pytorch/pytorch/commit/0122299f9ba729aa0c9bd43764af53225e03672c) Original commit changeset: 334ba1ed73ef fbshipit-source-id: ec95432f9957df56a5a04e52661f5db920b7f57f --- .../install_miniconda3.bat | 7 --- CMakeLists.txt | 8 +-- caffe2/CMakeLists.txt | 49 ++++++--------- cmake/Dependencies.cmake | 5 +- test/cpp/dist_autograd/CMakeLists.txt | 2 +- test/distributed/test_c10d.py | 49 +++++---------- test/distributed/test_c10d_spawn.py | 8 +-- test/run_test.py | 11 ++-- tools/build_variables.bzl | 7 +-- torch/CMakeLists.txt | 33 +++++----- torch/csrc/Module.cpp | 4 +- torch/csrc/WindowsTorchApiMacro.h | 6 -- torch/csrc/distributed/c10d/comm.h | 4 +- torch/csrc/distributed/c10d/init.cpp | 10 +-- torch/csrc/distributed/c10d/reducer.cpp | 22 ++++--- torch/csrc/distributed/c10d/reducer.h | 14 ----- torch/csrc/jit/python/pybind_utils.h | 8 +-- .../csrc/jit/python/python_sugared_value.cpp | 2 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +-- torch/csrc/jit/serialization/pickler.cpp | 6 +- torch/csrc/jit/serialization/unpickler.cpp | 6 +- torch/csrc/utils/future.h | 2 +- torch/distributed/rendezvous.py | 14 +---- torch/lib/c10d/CMakeLists.txt | 32 ++++------ torch/lib/c10d/FileStore.cpp | 51 +--------------- torch/lib/c10d/GlooDeviceFactory.cpp | 33 ++++------ torch/lib/c10d/ProcessGroupGloo.cpp | 61 +++---------------- torch/lib/c10d/Utils.cpp | 3 +- torch/lib/c10d/Utils.hpp | 4 -- torch/lib/c10d/test/CMakeLists.txt | 15 ++--- torch/lib/c10d/test/CUDATest.hpp | 10 +-- torch/lib/c10d/test/FileStoreTest.cpp | 8 --- torch/lib/c10d/test/ProcessGroupGlooTest.cpp | 9 +-- torch/lib/c10d/test/TestUtils.hpp | 30 +-------- torch/testing/_internal/common_distributed.py | 17 +----- torch/testing/_internal/common_utils.py | 4 -- torch/testing/_internal/dist_utils.py | 3 +- .../ddp_under_dist_autograd_test.py | 16 ++--- .../_internal/distributed/distributed_test.py | 48 ++++----------- 39 files changed, 167 insertions(+), 462 deletions(-) diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index cf7255ce3789..a66ef4b651c5 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,11 +12,4 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake - call conda install -y -q -c rdonnelly libuv ) - -:: Get installed libuv path -@echo off -set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library -@echo on -echo libuv_ROOT=%libuv_ROOT% diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 3d937e0e1655..826c187b602e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX AND NOT WIN32) +if(NOT LINUX) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,12 +226,6 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) -# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected -if(WIN32) - set(USE_TENSORPIPE OFF) - message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") -endif() - # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 219b28c69695..65f072b6f29d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -291,29 +291,26 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. 
- if(NOT WIN32) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) - - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) - endif() + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) endif() endif() @@ -496,7 +493,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED AND NOT WIN32) + if(USE_DISTRIBUTED) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -840,7 +837,7 @@ endif() if(BUILD_TEST AND NOT USE_ROCM) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED AND NOT WIN32) + if(USE_DISTRIBUTED) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -892,7 +889,9 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + if(NOT MSVC) + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) + endif() endif() @@ -967,14 +966,6 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) - # Pass USE_RPC in order to reduce use of - # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) - # need to be removed when RPC is supported - if(NOT WIN32) - target_compile_definitions(torch_cpu PRIVATE - USE_RPC - ) - endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. 
if(USE_TENSORPIPE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 023bbe9e8d07..028098f61d36 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,7 +1253,10 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(MSVC) + message(WARNING "Gloo can not be used on Windows.") + caffe2_update_option(USE_GLOO OFF) + elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 9969c63e16d5..5d23602881f0 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED AND NOT WIN32) +if(USE_DISTRIBUTED) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 911a73ce432e..a81bc53f175a 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests, skip_if_win32, create_device + simple_sparse_reduce_tests from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,7 +255,6 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) -@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -274,7 +273,6 @@ def test_address_already_in_use(self): store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 -@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -331,7 +329,6 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') -@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures def test_common_errors(self): @@ -458,7 +455,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' + url = 'file://%s?world_size=%d' % (file.name, 2) gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -477,7 +474,6 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) -@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -548,13 +544,9 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - if sys.platform == 'win32': - yield "file:///%s" % f.name.replace("\\", "/") - f.close() - else: - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in self._init_methods(): @@ -592,16 +584,11 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - - # For Windows platform, 
Python does not support fork, change it to spawn here. - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [create_device(interface=LOOPBACK)] + opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -611,8 +598,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - create_device(interface=LOOPBACK), - create_device(interface=LOOPBACK), + c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1527,7 +1514,6 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) - @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1545,7 +1531,6 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) - @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1974,10 +1959,7 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -2086,7 +2068,7 @@ def update_parameters(model): def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @@ -3965,10 +3947,7 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - if sys.platform == 'win32': - self._spawn_processes() - else: - self._fork_processes() + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -4034,7 +4013,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -4045,7 +4024,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [create_device(interface=LOOPBACK)] + options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) 
device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index c84608e8f178..d0bf00b8a08a 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,10 +10,8 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo, \ - create_device -from torch.testing._internal.common_utils import TestCase, load_tests, \ - run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo +from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -41,7 +39,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [create_device(interface='lo')] + opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/run_test.py b/test/run_test.py index 0f9d14a78605..d63fc372f9c2 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell import torch.distributed as dist from typing import Dict, Optional @@ -99,6 +99,7 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -305,13 +306,9 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): - if sys.platform == 'win32' and backend != 'gloo': - continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: - if sys.platform == 'win32' and not with_init_file: - continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -325,9 +322,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = f'{FILE_SCHEMA}{tmp_dir}/' + init_method = 'file://{}/'.format(tmp_dir) else: - init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' + init_method = 'file://{}/shared_init_file'.format(tmp_dir) os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index c21fab8ec2cf..174bb858da44 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -537,14 +537,11 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_core_sources = [ +libtorch_python_distributed_sources = [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", -] - -libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ - "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", 
"torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 2ae2f7f737fe..b78dc4a362a7 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -160,28 +160,25 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(WIN32) - append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) - else() - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) + if(NOT MSVC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() endif() - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL AND NOT WIN32) +if(USE_NCCL) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ae6f15155f2a..ed4aa21a8f76 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -688,9 +688,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#if defined(USE_DISTRIBUTED) && defined(USE_C10D) +#ifdef USE_DISTRIBUTED +#ifdef USE_C10D THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); -#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 7f44db0baba9..7f8ef4e01677 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,9 +5,3 @@ // There's no difference between aten, torch and caffe2 
libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API - -#ifdef _WIN32 -#define TORCH_PYTHON_API -#else -#define TORCH_PYTHON_API CAFFE2_API -#endif diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index 2eb626c40232..e2b501f08aff 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_PYTHON_API CommHookInterface { +struct TORCH_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_PYTHON_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { +class TORCH_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index be1752d7366f..165d6a1c8603 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,11 +1,7 @@ #include #include -#ifndef _WIN32 #include -#include -#include -#endif #include #ifdef USE_C10D_GLOO @@ -21,6 +17,8 @@ #endif #include +#include +#include #include #include @@ -325,7 +323,6 @@ They are used in specifying strategies for reduction collectives, e.g., shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) .def(py::init()); -#ifndef _WIN32 shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) .def(py::init<>()); @@ -343,7 +340,6 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); -#endif shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) .def(py::init>()); @@ -611,7 +607,6 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); -#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -625,7 +620,6 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); -#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 814d3494ff4e..86916c7994dd 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -89,7 +89,10 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex(replica_index, variable_index); + const auto index = VariableIndex{ + .replica_index = replica_index, + .variable_index = variable_index, + }; // The gradient accumulator function is lazily initialized once. 
// Therefore we can use its presence in the autograd graph as @@ -97,19 +100,15 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); -#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; -#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { -#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); -#endif this->autograd_hook(index); return outputs; })), @@ -478,7 +477,10 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex(replica_index, variable_index); + const auto index = VariableIndex{ + .replica_index = replica_index, + .variable_index = variable_index, + }; push_rebuilt_params(index); } } @@ -848,8 +850,10 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator( - bucket_index, intra_bucket_index++); + variable_locators_[variable_index] = VariableLocator{ + .bucket_index = bucket_index, + .intra_bucket_index = intra_bucket_index++, + }; } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -1231,9 +1235,7 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd -#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); -#endif } } diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 486b7337366a..960a32356acf 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -104,13 +104,6 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; - - VariableIndex() = default; - - VariableIndex(size_t replica_index_, size_t variable_index_) { - replica_index = replica_index_; - variable_index = variable_index_; - } }; void push_rebuilt_params(const VariableIndex& index); @@ -288,13 +281,6 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; - - VariableLocator() = default; - - VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { - bucket_index = bucket_index_; - intra_bucket_index = intra_bucket_index_; - } }; // Map the index of a variable to its location in the bucket structure. 
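(Context for the reducer.h/reducer.cpp hunks above: the revert restores
designated-initializer construction,

    const auto index = VariableIndex{
        .replica_index = replica_index,
        .variable_index = variable_index,
    };

which is only standard C++ from C++20 onward. GCC and Clang accept it as an
extension under the C++14 mode used here, while MSVC does not, which is
presumably why the reverted Windows patch had introduced the explicit
two-argument constructors removed above.)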
diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 4be55a9caa90..65f5a49145c8 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -320,7 +320,7 @@ inline InferredType tryToInferType(py::handle input) { if (py::isinstance(input)) { auto object = py::cast(input); return InferredType(object.type()); -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED } else if (py::isinstance(input)) { auto rref_ivalue = input.cast().toIValue(); return InferredType(rref_ivalue.type()); @@ -716,7 +716,7 @@ inline IValue toIValue( } } case TypeKind::RRefType: { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return obj.cast().toIValue(); #else AT_ERROR("RRef is only supported with the distributed package"); @@ -896,7 +896,7 @@ inline py::object toPyObject(IValue ivalue) { } return std::move(py_dict); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED auto RRefPtr = c10::dynamic_intrusive_pointer_cast( std::move(ivalue).toRRef()); @@ -942,7 +942,7 @@ inline py::object toPyObject(IValue ivalue) { auto py_class = getScriptedClassOrError(qualified_class_name); return py_class.attr(enum_holder->name().c_str()); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return py::cast(torch::distributed::rpc::PyRRef( c10::static_intrusive_pointer_cast( ivalue.toRRef()))); diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 119b6b5e5de7..ba94d33f37b3 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -916,7 +916,7 @@ std::shared_ptr toSugaredValue( } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { return SpecialFormValue::create(prim::annotate); -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED // RPC module is only avaialble when build flag "USE_DISTRIBUTED" is on. 
} else if ( obj.ptr() == diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index f61e2597447f..337fe66c0789 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include using torch::distributed::autograd::DistAutogradContainer; #endif @@ -267,7 +267,7 @@ void insertLastUses(Graph& g) { } inline int64_t getDistAutogradContextId() { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return DistAutogradContainer::currentContextId(); #else return 0; @@ -1690,7 +1690,7 @@ InterpreterState::InterpreterState( : pImpl(std::move(pImpl_)) {} void InterpreterContinuation::operator()() { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED auto prev_dist_id = DistAutogradContainer::currentContextId(); DistAutogradContainer::forceCurrentContextId(dist_autograd_context_id_); #endif @@ -1700,7 +1700,7 @@ void InterpreterContinuation::operator()() { } else { state.runAsync(stack); } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED DistAutogradContainer::forceCurrentContextId(prev_dist_id); #endif } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 2bc9abea8c57..6f911f4246cc 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include #endif #include @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { "this class."; AT_ERROR(err.str()); } else if (ivalue.isRRef()) { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED TORCH_CHECK( torch::distributed::rpc::getAllowJitRRefPickle() == true, "RRef jit pickling is only allowed inside RPC calls."); @@ -166,7 +166,7 @@ void Pickler::pushDevice(const IValue& ivalue) { } } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED void Pickler::pushRRef(const IValue& ivalue) { // It is the same as how rref is pickled in python, see PyRRef::pickle auto rrefInterface = ivalue.toRRef(); diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 9b8fce0b4869..c416f9641023 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -1,6 +1,6 @@ #include #include -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED #include #endif #include @@ -549,7 +549,7 @@ void Unpickler::readGlobal( stack_.emplace_back(int64_t(globals_.size() - 1)); return; } else if (module_name == "torch.distributed.rpc" && class_name == "rref") { -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED return rebuildRRef(); #else TORCH_INTERNAL_ASSERT( @@ -669,7 +669,7 @@ void Unpickler::rebuildTensor(bool quantized) { }); } -#ifdef USE_RPC +#ifdef USE_DISTRIBUTED void Unpickler::rebuildRRef() { globals_.emplace_back([this] { // It is the same as how rref is unpickled in python, diff --git a/torch/csrc/utils/future.h b/torch/csrc/utils/future.h index 093d043ecf7d..6d672ee86cd5 100644 --- a/torch/csrc/utils/future.h +++ b/torch/csrc/utils/future.h @@ -26,7 +26,7 @@ class TORCH_API FutureError final : public std::exception { // Most implementation is copied from FutureMessage and // c10::ivalue::Future template -class TORCH_PYTHON_API Future final { +class TORCH_API Future final { public: Future() = default; diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 4545aea2bf56..292634580aab 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -6,12 +6,9 @@ import torch._six as six 
import numbers import os -import sys -from . import FileStore +from . import FileStore, TCPStore from .constants import default_pg_timeout -if sys.platform != 'win32': - from . import TCPStore _rendezvous_handlers = {} @@ -93,10 +90,6 @@ def _error(msg): result = urlparse(url) path = result.path - if sys.platform == 'win32': - import urllib.request - path = urllib.request.url2pathname(result.path) - if not path: raise _error("path missing") query = dict(pair.split("=") for pair in filter(None, result.query.split("&"))) @@ -182,8 +175,7 @@ def _env_error(var): # If this configuration is invalidated, there is nothing we can do about it raise RuntimeError("Unable to perform rerendezvous using env:// method") -if sys.platform != 'win32': - register_rendezvous_handler("tcp", _tcp_rendezvous_handler) - register_rendezvous_handler("env", _env_rendezvous_handler) register_rendezvous_handler("file", _file_rendezvous_handler) +register_rendezvous_handler("tcp", _tcp_rendezvous_handler) +register_rendezvous_handler("env", _env_rendezvous_handler) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 4b206f380111..68fe49f411f5 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -45,16 +45,15 @@ endfunction() set(C10D_SRCS FileStore.cpp + HashStore.cpp ProcessGroup.cpp + ProcessGroupRoundRobin.cpp Store.cpp PrefixStore.cpp + TCPStore.cpp Utils.cpp ) -if(NOT WIN32) - list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp TCPStore.cpp) -endif() - set(C10D_LIBS torch) if(USE_C10D_NCCL) @@ -78,17 +77,14 @@ endif() add_library(c10d STATIC ${C10D_SRCS}) set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET c10d PROPERTY CXX_STANDARD 14) - -if(NOT MSVC) - target_compile_options(c10d PUBLIC - -Wall - -Wextra - -Wno-unused-parameter - -Wno-missing-field-initializers - -Wno-write-strings - -Wno-unknown-pragmas - ) -endif() +target_compile_options(c10d PUBLIC + -Wall + -Wextra + -Wno-unused-parameter + -Wno-missing-field-initializers + -Wno-write-strings + -Wno-unknown-pragmas + ) add_dependencies(c10d torch) @@ -122,19 +118,17 @@ if(USE_C10D_GLOO) endif() copy_header(FileStore.hpp) +copy_header(HashStore.hpp) copy_header(PrefixStore.hpp) copy_header(ProcessGroup.hpp) copy_header(Store.hpp) +copy_header(TCPStore.hpp) copy_header(Types.hpp) copy_header(Utils.hpp) if(USE_GLOO) copy_header(ProcessGroupGloo.hpp) copy_header(GlooDeviceFactory.hpp) endif() -if(NOT WIN32) - copy_header(HashStore.hpp) - copy_header(TCPStore.hpp) -endif() if(USE_C10D_NCCL) copy_header(ProcessGroupNCCL.hpp) diff --git a/torch/lib/c10d/FileStore.cpp b/torch/lib/c10d/FileStore.cpp index eb25c52f787a..55346e0fa635 100644 --- a/torch/lib/c10d/FileStore.cpp +++ b/torch/lib/c10d/FileStore.cpp @@ -3,16 +3,9 @@ #include #include #include -#include - -#ifdef _WIN32 -#include -#include -#include -#else #include +#include #include -#endif #include #include @@ -28,40 +21,6 @@ throw std::system_error(errno, std::system_category(), ##__VA_ARGS__); \ } -#ifdef _WIN32 -#define LOCK_EX 0x00000001 -#define LOCK_SH 0x00000010 -#define LOCK_UN 0x00000100 - -int flock_(int fd, int op) { - HANDLE hdl = (HANDLE) _get_osfhandle(fd); - DWORD low = 1, high = 0; - OVERLAPPED offset = {0, 0, 0, 0, NULL}; - - if (hdl < 0) - return -1; - - switch (op) { - case LOCK_EX: - if (LockFileEx(hdl, LOCKFILE_EXCLUSIVE_LOCK, 0, low, high, &offset)) - return 0; - break; - case LOCK_SH: - if (LockFileEx(hdl, 0, 0, low, high, &offset)) - return 0; - break; - case LOCK_UN: - 
if(UnlockFileEx(hdl, 0, low, high, &offset) != 0) - return 0; - break; - default: - break; - } - errno = EINVAL; - return -1; -} -#endif - namespace c10d { namespace { @@ -120,11 +79,7 @@ class Lock { int fd_{-1}; void flock(int operation) { -#ifdef _WIN32 - auto rv = syscall(std::bind(::flock_, fd_, operation)); -#else auto rv = syscall(std::bind(::flock, fd_, operation)); -#endif SYSASSERT(rv, "flock"); } }; @@ -137,11 +92,7 @@ class File { std::chrono::milliseconds timeout) { const auto start = std::chrono::steady_clock::now(); while (true) { -#ifdef _WIN32 - fd_ = syscall(std::bind(::open, path.c_str(), flags | _O_BINARY, _S_IREAD | _S_IWRITE)); -#else fd_ = syscall(std::bind(::open, path.c_str(), flags, 0644)); -#endif // Only retry when the file doesn't exist, since we are waiting for the // file to be created in this case to address the following issue: // https://github.com/pytorch/pytorch/issues/13750 diff --git a/torch/lib/c10d/GlooDeviceFactory.cpp b/torch/lib/c10d/GlooDeviceFactory.cpp index dca6b03eb9dd..70c3c2bb7a31 100644 --- a/torch/lib/c10d/GlooDeviceFactory.cpp +++ b/torch/lib/c10d/GlooDeviceFactory.cpp @@ -36,16 +36,16 @@ C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( #if GLOO_HAVE_TRANSPORT_TCP static std::shared_ptr<::gloo::transport::Device> makeTCPDevice( - const std::string& interfaceName, + const std::string& interface, const std::string& hostname) { TORCH_CHECK( - !interfaceName.empty() || !hostname.empty(), + !interface.empty() || !hostname.empty(), "GlooDeviceFactory::makeTCPDevice(): interface or hostname " "can't be empty"); ::gloo::transport::tcp::attr attr; - if (!interfaceName.empty()) { - attr.iface = interfaceName; + if (!interface.empty()) { + attr.iface = interface; } else { attr.hostname = hostname; } @@ -61,16 +61,16 @@ C10_REGISTER_CREATOR(GlooDeviceRegistry, TCP, makeTCPDevice); #if GLOO_HAVE_TRANSPORT_UV static std::shared_ptr<::gloo::transport::Device> makeUVDevice( - const std::string& interfaceName, + const std::string& interface, const std::string& hostname) { TORCH_CHECK( - !interfaceName.empty() || !hostname.empty(), + !interface.empty() || !hostname.empty(), "GlooDeviceFactory::makeUVDevice(): interface or hostname " "can't be empty"); ::gloo::transport::uv::attr attr; - if (!interfaceName.empty()) { - attr.iface = interfaceName; + if (!interface.empty()) { + attr.iface = interface; } else { attr.hostname = hostname; } @@ -81,28 +81,23 @@ static std::shared_ptr<::gloo::transport::Device> makeUVDevice( // the flexibility of other application to override by priority. Register // UV to `UV` for env "GLOO_DEVICE_TRANSPORT" override. 
C10_REGISTER_CREATOR(GlooDeviceRegistry, APPLE, makeUVDevice); -C10_REGISTER_CREATOR(GlooDeviceRegistry, WIN32, makeUVDevice); C10_REGISTER_CREATOR(GlooDeviceRegistry, UV, makeUVDevice); #endif static const char* glooDeviceTransport = getenv("GLOO_DEVICE_TRANSPORT"); std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: - makeDeviceForInterface(const std::string& interfaceName) { + makeDeviceForInterface(const std::string& interface) { if (glooDeviceTransport) { - return GlooDeviceRegistry()->Create(glooDeviceTransport, interfaceName, ""); + return GlooDeviceRegistry()->Create(glooDeviceTransport, interface, ""); } #ifdef __linux__ - return GlooDeviceRegistry()->Create("LINUX", interfaceName, ""); + return GlooDeviceRegistry()->Create("LINUX", interface, ""); #endif #ifdef __APPLE__ - return GlooDeviceRegistry()->Create("APPLE", interfaceName, ""); -#endif - -#ifdef _WIN32 - return GlooDeviceRegistry()->Create("WIN32", interfaceName, ""); + return GlooDeviceRegistry()->Create("APPLE", interface, ""); #endif throw std::runtime_error("makeDeviceForInterface(): unsupported gloo device"); @@ -122,10 +117,6 @@ std::shared_ptr<::gloo::transport::Device> GlooDeviceFactory:: return GlooDeviceRegistry()->Create("APPLE", "", hostname); #endif -#ifdef _WIN32 - return GlooDeviceRegistry()->Create("WIN32", "", hostname); -#endif - throw std::runtime_error("makeDeviceForHostname(): unsupported gloo device"); } diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index c139ac7a34fd..531fe751f1c9 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -2,16 +2,10 @@ #include -#ifdef _WIN32 -#include -#include -#include -#else #include #include -#include -#endif #include +#include #include @@ -42,36 +36,6 @@ #include #include -#ifdef _WIN32 -#define GENERATE_ALL_TYPES(type, func, ...) \ - switch (type) { \ - case ::at::ScalarType::Float: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Double: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Half: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Char: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Byte: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Int: \ - func(__VA_ARGS__); \ - break; \ - case ::at::ScalarType::Long: \ - func(__VA_ARGS__); \ - break; \ - default: \ - throw std::runtime_error("Invalid scalar type"); \ - } - -#define HOST_NAME_MAX 256 -#else #define GENERATE_ALL_TYPES(type, func, args...) \ switch (type) { \ case ::at::ScalarType::Float: \ @@ -98,7 +62,6 @@ default: \ throw std::runtime_error("Invalid scalar type"); \ } -#endif namespace c10d { @@ -446,19 +409,12 @@ ProcessGroupGloo::Options::Options() namespace { -void socketInitialize() { -#ifdef _WIN32 - ::gloo::init_winsock(); -#endif -} - // Gloo assumes that this machine's hostname can always be resolved // to an address. If it doesn't it throws a runtime error saying // that it can't be resolved. Instead of catching it, we choose // to proactively check if an address can be resolved, so we can // gracefully fall back to an alternative if it doesn't. 
bool doesHostnameResolveToUsableAddress(const std::string& hostname) { - socketInitialize(); struct addrinfo hints; memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -475,11 +431,7 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { continue; } rv = bind(fd, rp->ai_addr, rp->ai_addrlen); -#ifdef _WIN32 - closesocket(fd); -#else close(fd); -#endif if (rv == -1) { continue; } @@ -491,11 +443,14 @@ bool doesHostnameResolveToUsableAddress(const std::string& hostname) { } // namespace +#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: - createDeviceForInterface(const std::string& interface_name) { - return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface_name); + createDeviceForInterface(const std::string& interface) { + return ::c10d::GlooDeviceFactory::makeDeviceForInterface(interface); } +#endif +#if defined(__linux__) || defined(__APPLE__) std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDeviceForHostname(const std::string& hostname) { TORCH_CHECK( @@ -505,14 +460,14 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: " to a (local) address"); return ::c10d::GlooDeviceFactory::makeDeviceForHostname(hostname); } +#endif -#if defined(__linux__) || defined(_WIN32) +#ifdef __linux__ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: createDefaultDevice() { // Use the hostname to resolve the network address to // use. Note: if the hostname does not resolve to an address (e.g. // because of misconfigured /etc/hosts file), this will not work. - socketInitialize(); std::array hostname{}; auto rv = gethostname(hostname.data(), HOST_NAME_MAX); if (rv != 0) { diff --git a/torch/lib/c10d/Utils.cpp b/torch/lib/c10d/Utils.cpp index 6c6e941ef95d..d975f6eb6bc5 100644 --- a/torch/lib/c10d/Utils.cpp +++ b/torch/lib/c10d/Utils.cpp @@ -1,6 +1,5 @@ #include -#ifndef _WIN32 #include #include @@ -355,6 +354,6 @@ std::tuple accept( return std::make_tuple( socket, sockaddrToString(reinterpret_cast(&addr))); } + } // namespace tcputil } // namespace c10d -#endif diff --git a/torch/lib/c10d/Utils.hpp b/torch/lib/c10d/Utils.hpp index 1116cd39ba1c..1bdaddde9f24 100644 --- a/torch/lib/c10d/Utils.hpp +++ b/torch/lib/c10d/Utils.hpp @@ -1,8 +1,6 @@ #pragma once -#ifndef _WIN32 #include -#endif #include #include @@ -482,7 +480,6 @@ class ResourceGuard { bool released_; }; -#ifndef _WIN32 namespace tcputil { constexpr std::chrono::milliseconds kNoTimeout = std::chrono::milliseconds(-1); @@ -612,5 +609,4 @@ std::tuple accept( const std::chrono::milliseconds& timeout = kNoTimeout); } // namespace tcputil -#endif } // namespace c10d diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt index 003f56f30861..8429d1099b29 100644 --- a/torch/lib/c10d/test/CMakeLists.txt +++ b/torch/lib/c10d/test/CMakeLists.txt @@ -8,19 +8,14 @@ function(c10d_add_test test_src) get_filename_component(test_name ${test_src} NAME_WE) add_executable(${test_name} "${test_src}") target_include_directories(${test_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
- target_link_libraries(${test_name} ${ARGN}) - if(NOT WIN32) - target_link_libraries(${test_name} pthread) - target_compile_options(${test_name} PRIVATE -Wno-error) - endif() + target_link_libraries(${test_name} pthread ${ARGN}) + target_compile_options(${test_name} PRIVATE -Wno-error) add_test(NAME ${test_name} COMMAND $) endfunction() c10d_add_test(FileStoreTest.cpp c10d gtest_main) -if(NOT WIN32) - c10d_add_test(HashStoreTest.cpp c10d gtest_main) - c10d_add_test(TCPStoreTest.cpp c10d gtest_main) -endif() +c10d_add_test(HashStoreTest.cpp c10d gtest_main) +c10d_add_test(TCPStoreTest.cpp c10d gtest_main) if(USE_CUDA) if(USE_C10D_GLOO) @@ -34,7 +29,7 @@ if(USE_CUDA) endif() else() if(USE_C10D_GLOO) - c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main) + c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d gtest_main) endif() endif() diff --git a/torch/lib/c10d/test/CUDATest.hpp b/torch/lib/c10d/test/CUDATest.hpp index 328da2faf648..defaff895a18 100644 --- a/torch/lib/c10d/test/CUDATest.hpp +++ b/torch/lib/c10d/test/CUDATest.hpp @@ -5,15 +5,9 @@ namespace c10d { namespace test { -#ifdef _WIN32 -#define EXPORT_TEST_API __declspec(dllexport) -#else -#define EXPORT_TEST_API -#endif +void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); -EXPORT_TEST_API void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks); - -EXPORT_TEST_API int cudaNumDevices(); +int cudaNumDevices(); } // namespace test } // namespace c10d diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index cc8da6326091..77215f4521c2 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -1,8 +1,6 @@ #include -#ifndef _WIN32 #include -#endif #include #include @@ -12,11 +10,6 @@ #include #include -#ifdef _WIN32 -std::string tmppath() { - return c10d::test::autoGenerateTmpFilePath(); -} -#else std::string tmppath() { const char* tmpdir = getenv("TMPDIR"); if (tmpdir == nullptr) { @@ -36,7 +29,6 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } -#endif void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index da4f9b5fc106..6606e553e733 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -1,10 +1,7 @@ -#ifndef _WIN32 #include +#include #include #include -#endif - -#include #include #include @@ -24,7 +21,6 @@ using namespace c10d::test; constexpr auto kSendDelay = std::chrono::milliseconds(100); constexpr auto kWaitTimeout = std::chrono::milliseconds(1); -#ifndef _WIN32 class SignalTest { public: SignalTest(const std::string& path) : path_(path) {} @@ -96,7 +92,6 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( test.arm(fork.pid, signal); return test.run(0, 2); } -#endif class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: @@ -461,7 +456,6 @@ void testRecv(const std::string& path) { EXPECT_TRUE(recvCompleted); } -#ifndef _WIN32 TEST(ProcessGroupGlooTest, testSIGSTOPException) { // test SIGSTOP // Fork() and TSAN don't play well together, so skip the test if we're testing @@ -491,7 +485,6 @@ TEST(ProcessGroupGlooTest, testSIGKILLException) { EXPECT_FALSE(work->isSuccess()); EXPECT_THROW(std::rethrow_exception(work->exception()), std::exception); } -#endif TEST(ProcessGroupGlooTest, testAllReduceCPU) { { diff --git a/torch/lib/c10d/test/TestUtils.hpp 
b/torch/lib/c10d/test/TestUtils.hpp index 5f5dfca315cb..c62695485573 100644 --- a/torch/lib/c10d/test/TestUtils.hpp +++ b/torch/lib/c10d/test/TestUtils.hpp @@ -1,12 +1,9 @@ #pragma once -#ifndef _WIN32 #include +#include #include #include -#endif - -#include #include #include @@ -40,28 +37,6 @@ class Semaphore { std::condition_variable cv_; }; -#ifdef _WIN32 -std::string autoGenerateTmpFilePath() { - char tmp[L_tmpnam_s]; - errno_t err; - err = tmpnam_s(tmp, L_tmpnam_s); - if (err != 0) - { - throw std::system_error(errno, std::system_category()); - } - return std::string(tmp); -} - -std::string tmppath() { - const char* tmpfile = getenv("TMPFILE"); - if (tmpfile) { - return std::string(tmpfile); - } - else { - return autoGenerateTmpFilePath(); - } -} -#else std::string tmppath() { // TMPFILE is for manual test execution during which the user will specify // the full temp file path using the environmental variable TMPFILE @@ -88,7 +63,6 @@ std::string tmppath() { close(fd); return std::string(tmp.data(), tmp.size()); } -#endif bool isTSANEnabled() { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); @@ -106,7 +80,6 @@ struct TemporaryFile { } }; -#ifndef _WIN32 struct Fork { pid_t pid; @@ -128,7 +101,6 @@ struct Fork { return pid == 0; } }; -#endif } // namespace test } // namespace c10d diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index b2cd30c66812..f8e5b4822bd8 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -16,7 +16,7 @@ import torch.distributed as c10d from functools import partial, reduce -from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, FILE_SCHEMA +from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM class TestSkip(NamedTuple): exit_code: int @@ -143,23 +143,10 @@ def wrapper(*args, **kwargs): return wrapper -def skip_if_win32(): - return unittest.skipIf( - sys.platform == 'win32', - "This unit test case is not supportted on Windows platform", - ) - TIMEOUT_DEFAULT = 100 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400} -def create_device(interface=None): - if sys.platform == 'win32' or interface is None: - return c10d.ProcessGroupGloo.create_device(hostname="127.0.0.1") - else: - return c10d.ProcessGroupGloo.create_device(interface=interface) - - def get_timeout(test_id): return TIMEOUT_OVERRIDE.get(test_id.split('.')[-1], TIMEOUT_DEFAULT) @@ -219,7 +206,7 @@ def initialize_temp_directories(init_method=None): if init_method is not None: os.environ["INIT_METHOD"] = init_method else: - os.environ["INIT_METHOD"] = FILE_SCHEMA + os.path.join( + os.environ["INIT_METHOD"] = "file://" + os.path.join( init_dir_path, "shared_init_file" ) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 36434ff8aa2f..9959551031ff 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -53,10 +53,6 @@ torch.backends.disable_global_flags() -FILE_SCHEMA = "file://" -if sys.platform == 'win32': - FILE_SCHEMA = "file:///" - IS_SANDCASTLE = os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' class ProfilingMode(Enum): diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py index 93de304a53ca..b88765211df1 100644 --- a/torch/testing/_internal/dist_utils.py +++ b/torch/testing/_internal/dist_utils.py @@ -7,7 +7,6 @@ import torch.distributed as dist import torch.distributed.rpc as rpc from 
torch.distributed.rpc import _rref_context_get_debug_info # type: ignore[attr-defined] -from torch.testing._internal.common_utils import FILE_SCHEMA if not dist.is_available(): @@ -15,7 +14,7 @@ sys.exit(0) -INIT_METHOD_TEMPLATE = FILE_SCHEMA + "{file_name}" +INIT_METHOD_TEMPLATE = "file://{file_name}" def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True, diff --git a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py index 09db831e9999..1b1f755ed4cc 100644 --- a/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py +++ b/torch/testing/_internal/distributed/ddp_under_dist_autograd_test.py @@ -20,7 +20,7 @@ skip_if_lt_x_gpu, skip_if_rocm, ) -from torch.testing._internal.dist_utils import dist_init, INIT_METHOD_TEMPLATE +from torch.testing._internal.dist_utils import dist_init from torch.testing._internal.distributed.rpc.rpc_agent_test_fixture import ( RpcAgentTestFixture, ) @@ -329,7 +329,7 @@ def _remote_worker_process(self): gLogger.info("The remote worker is running.") dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -346,7 +346,7 @@ def _trainer_process(self, rank: int): ) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -363,7 +363,7 @@ def _master_process(self, ddp_mode: DdpMode, simulate_uneven_inputs: bool): gLogger.info("Running the master process...") dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -500,7 +500,7 @@ def _run_test_ddp_comparision(self, simulate_uneven_inputs=False): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -567,7 +567,7 @@ def test_ddp_dist_autograd_sparse_grads(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -604,7 +604,7 @@ def test_ddp_dist_autograd_local_vs_remote(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) @@ -651,7 +651,7 @@ def test_ddp_dist_autograd_local_vs_remote_gpu(self): torch.manual_seed(self.rank) dist.init_process_group( backend="gloo", - init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name), + init_method="file://{}".format(self.file_name), world_size=self.world_size, rank=self.rank, ) diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index af5e648f6acb..f6f2b9a6fbfb 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -1,4 +1,5 @@ import copy +import fcntl import itertools import random import math @@ -21,7 
+22,6 @@ import torch.nn as nn import torch.nn.functional as F from torch.distributed.distributed_c10d import _get_default_group, AllreduceOptions, GroupMember -from torch.testing._internal.common_utils import FILE_SCHEMA from torch.testing._internal.common_distributed import ( MultiProcessTestCase, TEST_SKIPS, @@ -43,10 +43,6 @@ except ImportError: HAS_TORCHVISION = False -if sys.platform == 'win32': - import msvcrt -else: - import fcntl class Foo: def __init__(self, x): @@ -195,17 +191,10 @@ def _lock(): lockfile = os.path.join(TEMP_DIR, "lockfile") with open(lockfile, "w") as lf: try: - if sys.platform == 'win32': - msvcrt.locking(lf.fileno(), msvcrt.LK_RLCK, 1) - yield - else: - fcntl.flock(lf.fileno(), fcntl.LOCK_EX) - yield + fcntl.flock(lf.fileno(), fcntl.LOCK_EX) + yield finally: - if sys.platform == 'win32': - msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1) - else: - fcntl.flock(lf.fileno(), fcntl.LOCK_UN) + fcntl.flock(lf.fileno(), fcntl.LOCK_UN) lf.close() @@ -281,7 +270,7 @@ def tearDown(self): @property def init_method(self): - return "{}{file_name}".format(FILE_SCHEMA, file_name=self.file_name) + return "file://{file_name}".format(file_name=self.file_name) @classmethod def _run(cls, rank, test_name, file_name): @@ -2173,13 +2162,8 @@ def _test_DDP_5iter( # save the model in the middle and reload if test_save and idx == 2 and INIT_METHOD.startswith("file://"): with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) with tempfile.TemporaryFile() as tmp_file: torch.save(model_DDP, tmp_file) @@ -2208,13 +2192,8 @@ def _test_DistributedDataParallel(self, gpu_subset, rank, output_device=None, gr # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # dummy data initialization local_bs = len(gpu_subset) @@ -2371,13 +2350,8 @@ def _test_DistributedDataParallel_SyncBatchNorm(self, gpu_subset, rank, local_bs # test serializable/unserializable with tempfile.NamedTemporaryFile() as tmp: - if sys.platform == 'win32': - torch.save(model_DDP, tmp) - tmp.seek(0) - model_DDP = torch.load(tmp) - else: - torch.save(model_DDP, tmp.name) - model_DDP = torch.load(tmp.name) + torch.save(model_DDP, tmp.name) + model_DDP = torch.load(tmp.name) # data initialization input_cpu = torch.randn(global_bs, 2) From bdf329ef8a256f2157aae86a5be28109c2589eb4 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 24 Sep 2020 22:49:17 -0700 Subject: [PATCH 125/449] SyncBN: preserve qconfig if it exists (#45317) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/45317 Eager mode quantization depends on the presence of the `config` model attribute. Currently converting a model to use `SyncBatchNorm` removes the qconfig - fixing this. This is important if a BN is not fused to anything during quantization convert. 
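For reference, a minimal sketch of the behavior this change guarantees (it mirrors the new `test_syncbn_preserves_qconfig` unit test added in this patch; not part of the patch itself):

```python
import torch
import torch.nn as nn

# An unfused BatchNorm with an eager-mode qconfig attached directly to it.
m = nn.Sequential(
    nn.Conv2d(1, 1, 1),
    nn.BatchNorm2d(1),
)
m[1].qconfig = torch.quantization.default_qconfig

# Converting to SyncBatchNorm now copies the qconfig over instead of dropping it.
m = nn.SyncBatchNorm.convert_sync_batchnorm(m)
assert hasattr(m[1], "qconfig")
```
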
Test Plan: ``` python test/test_quantization.py TestDistributed.test_syncbn_preserves_qconfig ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D23922072 fbshipit-source-id: cc1bc25c8e5243abb924c6889f78cf65a81be158 --- test/quantization/test_workflow_module.py | 15 +++++++++++++++ torch/nn/modules/batchnorm.py | 14 ++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 817e54460e07..5068a6fe7fd4 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -1536,6 +1536,21 @@ def forward(self, x): isinstance(fused_model.conv.bn, nn.SyncBatchNorm), "Expected BN to be converted to SyncBN") + def test_syncbn_preserves_qconfig(self): + """ + Makes sure that if a BatchNorm is not fused and a qconfig exists, + convering the module to SyncBatchNorm preserves the qconfig. + """ + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.BatchNorm2d(1), + ) + m[1].qconfig = torch.quantization.default_qconfig + m = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m) + self.assertTrue( + hasattr(m[1], "qconfig"), + "missing qconfig after SyncBatchNorm conversion") + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @override_qengines diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index 075311870439..f5ca6deb5b19 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -114,7 +114,7 @@ def forward(self, input: Tensor) -> Tensor: else: # use exponential moving average exponential_average_factor = self.momentum - r""" + r""" Decide whether the mini-batch stats should be used for normalization rather than the buffers. Mini-batch stats are used in training mode, and in eval mode when buffers are None. """ @@ -185,7 +185,7 @@ class BatchNorm1d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` @@ -258,7 +258,7 @@ class BatchNorm2d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` @@ -332,7 +332,7 @@ class BatchNorm3d(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. 
Default: ``True`` @@ -414,7 +414,7 @@ class SyncBatchNorm(_BatchNorm): track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics, and initializes statistics - buffers :attr:`running_mean` and :attr:`running_var` as ``None``. + buffers :attr:`running_mean` and :attr:`running_var` as ``None``. When these buffers are ``None``, this module always uses batch statistics. in both training and eval modes. Default: ``True`` process_group: synchronization of stats happen within each process group @@ -493,7 +493,7 @@ def forward(self, input: Tensor) -> Tensor: else: # use exponential moving average exponential_average_factor = self.momentum - r""" + r""" Decide whether the mini-batch stats should be used for normalization rather than the buffers. Mini-batch stats are used in training mode, and in eval mode when buffers are None. """ @@ -576,6 +576,8 @@ def convert_sync_batchnorm(cls, module, process_group=None): module_output.running_mean = module.running_mean module_output.running_var = module.running_var module_output.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + module_output.qconfig = module.qconfig for name, child in module.named_children(): module_output.add_module(name, cls.convert_sync_batchnorm(child, process_group)) del module From 95df8657c94492ff026112f8e51a24216f1a9a0c Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Thu, 24 Sep 2020 23:07:38 -0700 Subject: [PATCH 126/449] Enables test linalg (#45278) Summary: Fixes https://github.com/pytorch/pytorch/issues/45271. Pull Request resolved: https://github.com/pytorch/pytorch/pull/45278 Reviewed By: ngimel Differential Revision: D23926124 Pulled By: mruberry fbshipit-source-id: 26692597f9a1988e5fa846f97b8430c3689cac27 --- test/run_test.py | 1 + test/test_linalg.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/test/run_test.py b/test/run_test.py index d63fc372f9c2..b24a20c60f46 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -41,6 +41,7 @@ 'test_foreach', 'test_indexing', 'test_jit', + 'test_linalg', 'test_logging', 'test_mkldnn', 'test_multiprocessing', diff --git a/test/test_linalg.py b/test/test_linalg.py index c81b4dc37582..3dbf31497b77 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -5,7 +5,7 @@ from math import inf, nan, isnan from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY) + (TestCase, run_tests, TEST_NUMPY, IS_MACOS, IS_WINDOWS, TEST_WITH_ASAN) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack) from torch.testing._internal.jit_metaprogramming_utils import gen_script_fn_and_args @@ -56,11 +56,12 @@ def test_det(self, device, dtype): # NOTE: det requires a 2D+ tensor t = torch.randn(1, device=device, dtype=dtype) - with self.assertRaises(IndexError): + with self.assertRaises(RuntimeError): op(t) # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation + @skipCUDAIfNoMagma def test_norm_dtype(self, device): def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype): msg = ( @@ -154,6 +155,7 @@ def run_test_case(input, p, dim, keepdim): # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their matrix norm results match + @skipCUDAIfNoMagma @unittest.skipIf(not TEST_NUMPY, "NumPy not 
found") @dtypes(torch.float, torch.double) def test_norm_matrix(self, device, dtype): @@ -400,6 +402,8 @@ def gen_error_message(input_size, ord, keepdim, dim=None): # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) + @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!") + @unittest.skipIf(IS_MACOS, "Skipped on MacOS!") @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "Numpy not found") @@ -440,14 +444,14 @@ def is_broken_matrix_norm_case(ord, x): result_n = np.linalg.norm(x_n, ord=ord) if is_broken_matrix_norm_case(ord, x): - self.assertNotEqual(result, result_n, msg=msg) + continue else: self.assertEqual(result, result_n, msg=msg) # Test degenerate shape results match numpy for linalg.norm vector norms @skipCUDAIfNoMagma @skipCPUIfNoLapack - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @unittest.skipIf(TEST_WITH_ASAN, "Skipped on ASAN since it checks for undefined behavior.") @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim, should_error): From 99e0a87bbb4faa6bb539c0eedf323d79fdd8cfcf Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Thu, 24 Sep 2020 23:11:38 -0700 Subject: [PATCH 127/449] [nvFuser] Latency improvements for pointwise + reduction fusion (#45218) Summary: A lot of changes are in this update, some highlights: - Added Doxygen config file - Split the fusion IR (higher level TE like IR) from kernel IR (lower level CUDA like IR) - Improved latency with dynamic shape handling for the fusion logic - Prevent recompilation for pointwise + reduction fusions when not needed - Improvements to inner dimension reduction performance - Added input -> kernel + kernel launch parameters cache, added eviction policy - Added reduction fusions with multiple outputs (still single reduction stage) - Fixed code generation bugs for symbolic tiled GEMM example - Added thread predicates to prevent shared memory form being loaded multiple times - Improved sync threads placements with shared memory and removed read before write race - Fixes to FP16 reduction fusions where output would come back as FP32 Pull Request resolved: https://github.com/pytorch/pytorch/pull/45218 Reviewed By: ezyang Differential Revision: D23905183 Pulled By: soumith fbshipit-source-id: 12f5ad4cbe03e9a25043bccb89e372f8579e2a79 --- aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h | 1 + caffe2/CMakeLists.txt | 4 + test/cpp/jit/test_gpu.cpp | 1925 ++++++++++--- test/cpp/jit/tests.h | 21 +- test/test_jit_cuda_fuser.py | 99 +- test/test_jit_cuda_fuser_legacy.py | 6 + test/test_jit_cuda_fuser_profiling.py | 6 + tools/build_variables.bzl | 4 + torch/csrc/jit/codegen/cuda/codegen.cpp | 640 +++++ torch/csrc/jit/codegen/cuda/codegen.h | 22 + torch/csrc/jit/codegen/cuda/compute_at.cpp | 65 +- torch/csrc/jit/codegen/cuda/compute_at.h | 4 +- torch/csrc/jit/codegen/cuda/docs/.gitignore | 1 + .../jit/codegen/cuda/docs/documentation.h | 23 + .../csrc/jit/codegen/cuda/docs/fuser.doxygen | 2515 +++++++++++++++++ .../cuda/docs/images/ir_architecture.png | Bin 0 -> 96754 bytes torch/csrc/jit/codegen/cuda/docs/main_page.md | 8 + torch/csrc/jit/codegen/cuda/executor.cpp | 395 ++- torch/csrc/jit/codegen/cuda/executor.h | 86 +- .../jit/codegen/cuda/executor_kernel_arg.cpp | 2 +- .../jit/codegen/cuda/executor_kernel_arg.h | 8 + .../jit/codegen/cuda/executor_launch_params.h | 5 + .../csrc/jit/codegen/cuda/executor_utils.cpp | 217 +- 
torch/csrc/jit/codegen/cuda/executor_utils.h | 17 +- .../csrc/jit/codegen/cuda/expr_evaluator.cpp | 219 +- torch/csrc/jit/codegen/cuda/expr_evaluator.h | 86 +- torch/csrc/jit/codegen/cuda/fusion.cpp | 153 +- torch/csrc/jit/codegen/cuda/fusion.h | 56 +- torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 7 + torch/csrc/jit/codegen/cuda/index_compute.cpp | 186 +- .../csrc/jit/codegen/cuda/instrumentation.cpp | 71 + torch/csrc/jit/codegen/cuda/instrumentation.h | 93 + torch/csrc/jit/codegen/cuda/interface.cpp | 1 + torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp | 22 +- torch/csrc/jit/codegen/cuda/ir_base_nodes.h | 1 + torch/csrc/jit/codegen/cuda/ir_cloner.cpp | 76 - torch/csrc/jit/codegen/cuda/ir_cloner.h | 23 - torch/csrc/jit/codegen/cuda/ir_graphviz.cpp | 50 - torch/csrc/jit/codegen/cuda/ir_graphviz.h | 7 - .../jit/codegen/cuda/ir_interface_nodes.h | 9 +- .../csrc/jit/codegen/cuda/ir_internal_nodes.h | 12 + torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 840 +----- torch/csrc/jit/codegen/cuda/ir_iostream.h | 108 +- torch/csrc/jit/codegen/cuda/ir_nodes.cpp | 377 ++- torch/csrc/jit/codegen/cuda/ir_printer.h | 54 +- torch/csrc/jit/codegen/cuda/iter_visitor.cpp | 104 +- torch/csrc/jit/codegen/cuda/iter_visitor.h | 71 +- torch/csrc/jit/codegen/cuda/kernel.cpp | 147 +- torch/csrc/jit/codegen/cuda/kernel.h | 121 +- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 384 ++- torch/csrc/jit/codegen/cuda/kernel_cache.h | 111 +- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 315 +-- torch/csrc/jit/codegen/cuda/kernel_ir.h | 215 +- .../jit/codegen/cuda/kernel_ir_builder.cpp | 104 + .../csrc/jit/codegen/cuda/kernel_ir_builder.h | 81 + .../codegen/cuda/kernel_resource_strings.h | 77 +- torch/csrc/jit/codegen/cuda/lower2device.cpp | 191 +- torch/csrc/jit/codegen/cuda/lower2device.h | 41 +- torch/csrc/jit/codegen/cuda/lower_index.cpp | 137 +- torch/csrc/jit/codegen/cuda/lower_index.h | 8 + .../jit/codegen/cuda/lower_insert_syncs.cpp | 227 ++ .../jit/codegen/cuda/lower_insert_syncs.h | 51 + torch/csrc/jit/codegen/cuda/lower_loops.cpp | 157 +- torch/csrc/jit/codegen/cuda/lower_loops.h | 69 +- .../codegen/cuda/lower_thread_predicate.cpp | 68 +- .../jit/codegen/cuda/lower_thread_predicate.h | 32 +- torch/csrc/jit/codegen/cuda/lower_unroll.cpp | 20 +- torch/csrc/jit/codegen/cuda/lower_unroll.h | 8 +- torch/csrc/jit/codegen/cuda/lower_utils.cpp | 52 +- .../jit/codegen/cuda/lower_validation.cpp | 5 +- torch/csrc/jit/codegen/cuda/manager.cpp | 62 +- torch/csrc/jit/codegen/cuda/parser.cpp | 42 +- torch/csrc/jit/codegen/cuda/partition.cpp | 5 + .../jit/codegen/cuda/predicate_compute.cpp | 76 +- .../csrc/jit/codegen/cuda/predicate_compute.h | 8 +- torch/csrc/jit/codegen/cuda/scheduler.cpp | 483 ++-- torch/csrc/jit/codegen/cuda/scheduler.h | 32 +- .../csrc/jit/codegen/cuda/shape_inference.cpp | 12 +- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 21 +- torch/csrc/jit/codegen/cuda/transform_iter.h | 2 + .../jit/codegen/cuda/transform_replay.cpp | 7 + .../jit/codegen/cuda/transform_rfactor.cpp | 5 + torch/csrc/jit/codegen/cuda/type.h | 8 + torch/csrc/jit/codegen/cuda/utils.h | 15 + 84 files changed, 8911 insertions(+), 3188 deletions(-) create mode 100644 torch/csrc/jit/codegen/cuda/codegen.cpp create mode 100644 torch/csrc/jit/codegen/cuda/codegen.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/.gitignore create mode 100644 torch/csrc/jit/codegen/cuda/docs/documentation.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/fuser.doxygen create mode 100644 torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png create mode 
100644 torch/csrc/jit/codegen/cuda/docs/main_page.md create mode 100644 torch/csrc/jit/codegen/cuda/instrumentation.cpp create mode 100644 torch/csrc/jit/codegen/cuda/instrumentation.h create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp create mode 100644 torch/csrc/jit/codegen/cuda/lower_insert_syncs.h diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 4630465115c7..00e57ca63520 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -42,6 +42,7 @@ namespace at { namespace cuda { _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ _(cuModuleLoadData) \ + _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(cuGetErrorString) \ diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..6ea848bd32e5 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -506,6 +506,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp @@ -515,6 +516,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/fusion.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/graph_fuser.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/index_compute.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/instrumentation.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_base_nodes.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_cloner.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -524,7 +526,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir_builder.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_index.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_insert_syncs.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_thread_predicate.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_unroll.cpp diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 80fa318d653a..d18becfa6641 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -73,11 +75,11 @@ TensorView* makeTensorWithContig( } void checkIntValue( - const EvaluationContext* eval_context, + StatefulExpressionEvaluator& evaluator, Val* val, Int::ScalarType expected_value) { TORCH_CHECK(val->isAnInt()); - const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); + const auto actual_value = evaluator.inferValue(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } @@ -162,16 +164,16 @@ void testGPU_FusionExprEvalConstants() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator 
evaluator(&fusion); auto* a = new Int(7); auto* b = new Int(3); - checkIntValue(&eval_context, neg(a), -7); - checkIntValue(&eval_context, add(a, b), 10); - checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); + checkIntValue(evaluator, neg(a), -7); + checkIntValue(evaluator, add(a, b), 10); + checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); } // Evaluate basic scalar operations with bound values @@ -179,7 +181,7 @@ void testGPU_FusionExprEvalBindings() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(); auto* b = new Int(); @@ -188,35 +190,35 @@ void testGPU_FusionExprEvalBindings() { auto* e = new Int(0); // trying to evaluate before binding should give empty results - TORCH_CHECK(!ExpressionEvaluator::evaluate(a, &eval_context).has_value()); - TORCH_CHECK(!ExpressionEvaluator::evaluate(d, &eval_context).has_value()); + TORCH_CHECK(!evaluator.inferValue(a).has_value()); + TORCH_CHECK(!evaluator.inferValue(d).has_value()); - eval_context.bind(a, 7); - eval_context.bind(b, 3); + evaluator.safeBind(a, 7); + evaluator.safeBind(b, 3); // can't bind to the results of expressions - ASSERT_ANY_THROW(eval_context.bind(c, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(c, 100)); // can't bind to concrete values - ASSERT_ANY_THROW(eval_context.bind(e, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(e, 100)); - checkIntValue(&eval_context, c, 10); - checkIntValue(&eval_context, sub(a, b), 4); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); - checkIntValue(&eval_context, d, -4); + checkIntValue(evaluator, c, 10); + checkIntValue(evaluator, sub(a, b), 4); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); + checkIntValue(evaluator, d, -4); // Reset evaluation context - eval_context = EvaluationContext(&fusion); + evaluator = StatefulExpressionEvaluator(&fusion); - eval_context.bind(a, 2); - eval_context.bind(b, 5); + evaluator.safeBind(a, 2); + evaluator.safeBind(b, 5); - checkIntValue(&eval_context, c, 7); - checkIntValue(&eval_context, sub(a, b), -3); - checkIntValue(&eval_context, mod(a, b), 2); - checkIntValue(&eval_context, ceilDiv(a, b), 1); - checkIntValue(&eval_context, d, -2); + checkIntValue(evaluator, c, 7); + checkIntValue(evaluator, sub(a, b), -3); + checkIntValue(evaluator, mod(a, b), 2); + checkIntValue(evaluator, ceilDiv(a, b), 1); + checkIntValue(evaluator, d, -2); } // Evaluate expressions in a simple IR @@ -247,8 +249,8 @@ void testGPU_FusionExprEvalBasic() { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values // @@ -258,21 +260,21 @@ void testGPU_FusionExprEvalBasic() { // (ex. 
`tv0->getRootDomain()[0]->extent()` // instead of `tv0->axis(0)->extent()`) // - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); } // Evaluate expressions in a more complex IR @@ -298,33 +300,33 @@ void testGPU_FusionExprEvalComplex() { tv6->split(0, 5); tv5->merge(0); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 129); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 127); // Evaluate and check extent values TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv0->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv0->axis(1)->rawExtent(), 127); TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 127); TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv4->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv4->axis(1)->rawExtent(), 127); TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); + checkIntValue(evaluator, tv5->axis(0)->rawExtent(), 16383); TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); - checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); - checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); + checkIntValue(evaluator, tv6->axis(0)->rawExtent(), 26); + checkIntValue(evaluator, tv6->axis(1)->rawExtent(), 5); + checkIntValue(evaluator, tv6->axis(2)->rawExtent(), 127); } // Evaluate expressions post lowering @@ -360,31 +362,29 @@ void 
testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); - checkIntValue(&eval_context, bid_x, 2); - checkIntValue(&eval_context, tid_x, 128); + checkIntValue(evaluator, bid_x, 2); + checkIntValue(evaluator, tid_x, 128); } void testGPU_FusionClear() { @@ -505,10 +505,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -529,12 +531,14 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } void testGPU_FusionMove() { @@ -593,9 +597,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -799,48 +801,6 @@ void testGPU_FusionTensor() { } } - { - auto tensor = at::randn({2, 1, 4}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == 
DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - } - - { - auto tensor = at::randn({2, 3, 1}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[2]); - } - // TensorType::create fills stride_properties, which helps us to mark // IterDomain properly // Note: implementation could change, depending on how much we want to invest @@ -1156,43 +1116,36 @@ void testGPU_FusionParser() { // 1. this can be moved to a dedicated "golden" file // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3){ - float T2[4]; - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; +__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { + float T2[1]; + if ((((((blockIdx.x * 1) + (1 - 1)) * 128) + threadIdx.x) < T0.size[0])) { + for(size_t i6 = 0; i6 < 1; ++i6) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } else { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + for(size_t i6 = 0; i6 < 1; ++i6) { + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } - } - } - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; - } - } else { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + 
threadIdx.x ) < T0.size[0] ) ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } } } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1576,11 +1529,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1636,11 +1585,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1706,11 +1651,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -1752,176 +1693,715 @@ void testGPU_FusionAdvancedComputeAt() { } } -void testGPU_FusionScalarInputs() { +void testGPU_FusionComputeAtMultiConsumers() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 Fusion fusion; FusionGuard fg(&fusion); - TensorView* tv0 = makeDummyTensor(2); + TensorView* tv0 = makeDummyTensor(1); fusion.addInput(tv0); - TensorView* tv1 = makeDummyTensor(2); - fusion.addInput(tv1); - Float* f0 = new Float(); - fusion.addInput(f0); - Float* f1 = new Float(); - fusion.addInput(f1); - Float* f2 = new Float(); - fusion.addInput(f2); - Float* f3 = new Float(); - fusion.addInput(f3); - Val* f4 = mul(f0, f1); - Val* f5 = sub(f2, f3); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + fusion.addOutput(tv2); + fusion.addOutput(tv3); - TensorView* tv2 = sub(tv1, f4); - TensorView* tv3 = add(tv0, f5); - TensorView* tv4 = mul(tv3, tv2); + // This computeAt will affect tv2 as well, even though tv2 is not in + // the data-flow path between tv1 and tv3. The reason is that tv1 is + // now computed at tv3, so tv2 must also be computed at the same + // location. Overall, what will happen is basically we merge + // expressions of all tensors and compute them in a single loop + // nest. + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + // Note that tv2 is also computed at tv3. 
+ TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + TORCH_CHECK(!tv3->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + + at::Tensor kernel_tv2 = at::empty_like(t0, options); + at::Tensor kernel_tv3 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv2, kernel_tv3}); + + TORCH_CHECK(at::allclose(kernel_tv2, t2)); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); +} +// Similar to ComputeAtMultiConsumers, but with a common consumer. +void testGPU_FusionComputeAtCommonConsumer1() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + fusion.addOutput(tv3); fusion.addOutput(tv4); + fusion.addOutput(tv5); - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); + // Computing tv1 at tv3. This will affect tv2 as discussed in + // ComplexComputeAt1. Additionally, in this case, notice that tv4 is + // the common consumer of tv2 and tv3, so they are computed at + // tv4. The indirect propagation of the computeAt should stop at the + // common consumer, and no further change should occur. More + // specifically, tv4 and tv5 should not have a computeAt tensor. 
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(!tv4->hasComputeAt()); + TORCH_CHECK(!tv5->hasComputeAt()); - tv4->axis(0)->parallelize(ParallelType::BIDx); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +void testGPU_FusionComputeAtCommonConsumer2() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv3 + tv4 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + + fusion.addOutput(tv5); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This computeAt will affect all tensors including tv3, tv4 and + // tv5, even though it appears to impact only tv1 and tv2. The + // reason is that tv1 is now computed at tv3, so tv4 must also be + // computed at the same location. Similarly, the consumer of tv4, + // tv5, must also be computed at the same location. Overall, what + // will happen is basically we merge expressions of all tensors and + // compute them in a single loop nest. Internally, this will be + // realized by making all tensors, except for those in the path + // between tv1 and tv3, computed at tv5, which we call the common + // consumer. 
+ tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(!tv5->hasComputeAt()); for (Val* val : fusion.vals()) { if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - + TensorView* tv = val->as(); tv->axis(1)->parallelize(ParallelType::Unroll); tv->axis(-1)->parallelize(ParallelType::TIDx); } } - // f4 = f0 * f1 - // f5 = f2 - f3 - // t2 = t1 - f4 - // t3 = t0 + f5 - // t4 = t3 * t2 - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto t4 = t3.mul(t2); - at::Tensor kernel_tv4 = at::empty_like(t0, options); + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; - at::Scalar test(fl0); + at::Tensor kernel_tv5 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - fe.runFusion( - {t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}, - {kernel_tv4}); - - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); + fe.runFusion({t0}, {kernel_tv5}); - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); } -void testGPU_FusionLoopUnroll() { +// Similar to the above common consumer test but adds an additional +// tensor that has no common consumer with the other tensors. 
+void testGPU_FusionComputeAtCommonConsumer3() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv2 + tv3 + // tv6 = tv1 + 6 Fusion fusion; FusionGuard fg(&fusion); - // Set up your input tensor views - TensorView* tv0 = makeDummyTensor(3); - TensorView* tv1 = makeDummyTensor(3); - - // Register your inputs + TensorView* tv0 = makeDummyTensor(2); fusion.addInput(tv0); - fusion.addInput(tv1); - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Float(2.0)); - TensorView* tv3 = add(tv0, tv2); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + TensorView* tv6 = add(tv1, new Float(6.0)); - // Register your outputs - fusion.addOutput(tv3); + fusion.addOutput(tv5); + fusion.addOutput(tv6); - int block_size = 16; + TensorView* computeAtTarget = tv3; - tv3->merge(0, 1); - tv3->merge(0, 1); + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); - tv3->split(0, block_size); - tv3->split(0, 4); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); + // This will have the same impact on the tensors except for tv5 and + // tv6. tv6 does not have any common consumer with the computeAt + // target, but since it uses tv1, it must be also computed at the + // same location as the other impacted tensors. We can either make + // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 + // should be computed at tv6 just because the current implementation + // orders the computeAt relationship based on the order in which + // tensors are specified as outputs. - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); + tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + + // tv5 should be computed at tv6 since tv5 is added as an output + // before tv6. If we call fusion.addOutput(tv6) first, tv6 should be + // computed at tv5. 
+ TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::rand({129, 13, 3}, options); - at::Tensor input1 = at::rand({129, 13, 3}, options); + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + auto t6 = t1.add({6.0}); + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); + fe.runFusion({t0}, {kernel_tv5, kernel_tv6}); - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); } -/* - * Helper function for single op testing that generates a codegen operand - */ +// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor +// that does not have data dependency with the consumer. +void testGPU_FusionComputeAtNoCommonConsumer() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv1 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + // tv6 = tv1 * 6 + Fusion fusion; + FusionGuard fg(&fusion); -Val* gen_jit_operand(std::pair desc) { - if (desc.first == ValType::TensorView) { - return makeDummyTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) - return new Float(); - else if (desc.second == DataType::Int) - return new Int(); - else - TORCH_CHECK("Not currently supported type", desc.first); - } else { - TORCH_CHECK("Not currently supported type", desc.first); + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + // Notice that tv6 is not a consumer of tv4. 
+ TensorView* tv6 = mul(tv1, new Float(6.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv6}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); } - return nullptr; -} -/* - * Helper function for single op testing that generates an ATen operand - */ + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); -IValue gen_aten_operand( - std::pair desc, - int blocks, + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + auto t6 = t1 * 6.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + +namespace { + +void checkConcretized( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_concretize) { + if (should_concretize) { + TORCH_CHECK( + IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } else { + TORCH_CHECK( + !IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } +} + +} // namespace + +void testGPU_FusionBCastConcretizeBasic() { + Fusion fusion; + FusionGuard fg(&fusion); + + // tv0: [I I] + TensorView* tv0 = makeDummyTensor(2); + + // tv1: [I I I] + TensorView* tv1 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + // tv2*: [B I I] + auto tv2_0 = broadcast(tv0, {true, false, false}); + auto tv2_1 = broadcast(tv0, {true, false, false}); + auto tv2 = add(tv2_0, tv2_1); + + // tv3: [I I I] + auto tv3 = add(tv2, tv1); + + fusion.addOutput(tv3); + + checkConcretized(tv2, 0, tv1, 0, true); + checkConcretized(tv2_0, 0, tv1, 0, true); + checkConcretized(tv2_1, 0, tv1, 0, true); + checkConcretized(tv2_0, 1, tv1, 0, false); + checkConcretized(tv2_0, 0, tv1, 1, false); +} + +void testGPU_FusionBCastConcretizeRfactor() { + Fusion fusion; + FusionGuard fg(&fusion); + + // both tv0 and tv1 = [I, I] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + //[B,I,I] + auto tv2 = broadcast(tv1, {true, false, false}); + + //[B,I,R] + auto tv3 = sum(tv2, {2}); + + auto tv5 = add(tv3, tv1); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv3->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv4 = tv3->rFactor({3}); + + checkConcretized(tv2, 0, tv5, 0, true); + checkConcretized(tv4, 0, tv5, 
0, true); + checkConcretized(tv3, 0, tv5, 0, true); +} + +namespace { + +void checkIdProvedEquivalent( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_prove) { + if (should_prove) { + TORCH_CHECK(IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } else { + TORCH_CHECK(!IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } +} + +} // namespace + +void testGPU_FusionProveIdEqBasic() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv3 = broadcast(tv0, {true, false, false}); + auto tv4 = broadcast(tv1, {false, true, false}); + auto tv5 = add(tv3, tv4); + fusion.addOutput(tv5); + + checkIdProvedEquivalent(tv0, 0, tv4, 1, true); + checkIdProvedEquivalent(tv1, 0, tv4, 0, true); + checkIdProvedEquivalent(tv1, 1, tv0, 1, true); + checkIdProvedEquivalent(tv0, 0, tv5, 1, true); + checkIdProvedEquivalent(tv1, 1, tv5, 2, true); + checkIdProvedEquivalent(tv0, 0, tv1, 0, false); + checkIdProvedEquivalent(tv0, 1, tv1, 0, false); + checkIdProvedEquivalent(tv0, 0, tv1, 1, false); +} + +void testGPU_FusionProveIdEqRfactor() { + Fusion fusion; + FusionGuard fg(&fusion); + + // [I,I] + TensorView* tv0 = makeDummyTensor(2); + // [I,I,I] + TensorView* tv1 = makeDummyTensor(3); + + //[I,I,R] + auto tv2 = sum(tv1, {2}); + + auto tv5 = add(tv2, tv0); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv2->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv3 = tv2->rFactor({3}); + + checkIdProvedEquivalent(tv1, 0, tv0, 0, true); + checkIdProvedEquivalent(tv2, 0, tv0, 0, true); + checkIdProvedEquivalent(tv3, 0, tv0, 0, true); +} + +void testGPU_FusionScalarInputs() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeDummyTensor(2); + fusion.addInput(tv1); + + Float* f0 = new Float(); + fusion.addInput(f0); + Float* f1 = new Float(); + fusion.addInput(f1); + Float* f2 = new Float(); + fusion.addInput(f2); + Float* f3 = new Float(); + fusion.addInput(f3); + Val* f4 = mul(f0, f1); + Val* f5 = sub(f2, f3); + + TensorView* tv2 = sub(tv1, f4); + TensorView* tv3 = add(tv0, f5); + TensorView* tv4 = mul(tv3, tv2); + + fusion.addOutput(tv4); + + // Lets setup to actually run + while (tv4->nDims() > 1) + tv4->merge(0); + tv4->split(0, 128); + tv4->split(0, 4); + + tv0->computeAt(tv4, 1); + tv1->computeAt(tv4, 1); + + tv4->axis(0)->parallelize(ParallelType::BIDx); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = static_cast(val); + + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + // f4 = f0 * f1 + // f5 = f2 - f3 + // t2 = t1 - f4 + // t3 = t0 + f5 + // t4 = t3 * t2 + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + float fl0 = 0.1; + float fl1 = -0.2; + float fl2 = 0.3; + float fl3 = -0.4; + float fl4 = fl0 * fl1; + float fl5 = fl2 - fl3; + + at::Tensor t0 = at::randn({129, 127}, options); + at::Tensor t1 = at::rand_like(t0, options); + + auto t2 = t1.sub(fl4); + auto t3 = t0.add(fl5); + auto t4 = t3.mul(t2); + + at::Tensor kernel_tv4 = at::empty_like(t0, options); + + at::Scalar test(fl0); + + torch::jit::fuser::cuda::FusionExecutor fe; + 
fe.compileFusion(&fusion); + fe.runFusion( + {t0, + t1, + at::Scalar(fl0), + at::Scalar(fl1), + at::Scalar(fl2), + at::Scalar(fl3)}, + {kernel_tv4}); + + TORCH_CHECK(at::allclose(kernel_tv4, t4)); +} + +void testGPU_FusionLoopUnroll() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3); + TensorView* tv1 = makeDummyTensor(3); + + // Register your inputs + fusion.addInput(tv0); + fusion.addInput(tv1); + + // Do math with it, it returns a `Val*` but can be static_casted back to + // TensorView + TensorView* tv2 = add(tv1, new Float(2.0)); + TensorView* tv3 = add(tv0, tv2); + + // Register your outputs + fusion.addOutput(tv3); + + int block_size = 16; + + tv3->merge(0, 1); + tv3->merge(0, 1); + + tv3->split(0, block_size); + tv3->split(0, 4); + + // For all inputs, computeAt the output inline, temporaries should be squeezed + // between them + tv0->computeAt(tv3, 1); + tv1->computeAt(tv3, 1); + + // Parallelize + tv2->axis(1)->parallelize(ParallelType::Unroll); + tv3->axis(1)->parallelize(ParallelType::Unroll); + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(0)->parallelize(ParallelType::BIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor input0 = at::rand({129, 13, 3}, options); + at::Tensor input1 = at::rand({129, 13, 3}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input0, input1}); + + TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); +} + +/* + * Helper function for single op testing that generates a codegen operand + */ + +Val* gen_jit_operand(std::pair desc) { + if (desc.first == ValType::TensorView) { + return makeDummyTensor(2, desc.second); + } else if (desc.first == ValType::Scalar) { + if (desc.second == DataType::Float) + return new Float(); + else if (desc.second == DataType::Int) + return new Int(); + else + TORCH_CHECK("Not currently supported type", desc.first); + } else { + TORCH_CHECK("Not currently supported type", desc.first); + } + return nullptr; +} + +/* + * Helper function for single op testing that generates an ATen operand + */ + +IValue gen_aten_operand( + std::pair desc, + int blocks, int threads, bool rand) { if (desc.first == ValType::TensorView) { @@ -2012,7 +2492,7 @@ void test_op( gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); std::vector output_vect = {output}; cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); torch::jit::fuser::cuda::FusionExecutor fe; @@ -2020,7 +2500,7 @@ void test_op( fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); at::Tensor ref_output = af(aten_inputs); cudaDeviceSynchronize(); // This sync shouldn't be necessary; @@ -2054,12 +2534,8 @@ void test_op( op_str, " -- had a mismatch.", aten_inputs_to_str(), - "\nJIT: ", - output, - "\nREF: ", - ref_output, - "\nDIFF: ", - diff, + "\nABS MAX DIFF: ", + output.sub(ref_output).abs().max(), "\n"); } @@ -2385,14 +2861,8 @@ void testGPU_FusionCastOps() { "\nOp Type: -- ", "cast FP16->FP32->FP16", " -- had a mismatch.\n", - "IN1 : ", - input1, - "\n", - "JIT: ", - outputs[0], - "\n", - "REF: ", - ref_output, + "\nABS MAX DIFF: ", + outputs[0].sub(ref_output).abs().max(), "\n"); } @@ -3453,10 +3923,6 @@ void testGPU_FusionAdvancedIndexing() { FusionGuard fg(&fusion); int w = 
3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); auto tv0 = makeDummyTensor(3); auto tv1 = makeDummyTensor(4); @@ -3465,10 +3931,42 @@ void testGPU_FusionAdvancedIndexing() { auto tv2 = add(tv0, new Float(1.0)); auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + auto t2 = t0.add(1.0); + auto t3 = t2.add(t1); + + TORCH_CHECK(t3.allclose(outputs[0])); + } + + { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeConcreteTensor({10, 20}); + fusion.addInput(tv0); + TensorView* tv1 = makeConcreteTensor({10, 10, 20}); + fusion.addInput(tv1); + + TensorView* tv2 = add(tv0, new Float(1)); + TensorView* tv3 = broadcast(tv2, {true, false, false}); + TensorView* tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({10, 20}, options); + at::Tensor t1 = at::randn({10, 10, 20}, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); @@ -4624,23 +5122,21 @@ void testGPU_FusionReductionScheduler() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand({bid_x, tid_x}, options); + at::Tensor input = at::randn({bid_x, tid_x}, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); cuda::FusionExecutor fe; fe.compileFusion(&fusion); // no broadcasting needed, omitting the last optional argument; - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({red_dim}); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -4685,9 +5181,9 @@ void testGPU_FusionSymbolicReduction() { // How many threads to use for the block reduction int runtime_threadIdx_dim = 128; - torch::jit::fuser::cuda::FusionExecutor executor; - executor.compileFusion(&fusion); - auto outputs = executor.runFusion( + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( {input}, torch::jit::fuser::cuda::LaunchParams( -1, -1, -1, runtime_threadIdx_dim, -1, -1)); @@ -4716,24 +5212,22 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); at::Tensor cg_output = at::empty(tensor_dims_out, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - 
cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -4758,26 +5252,26 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); - TORCH_CHECK( - cuda::scheduleReduction(&fusion, {input}, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-05, 1e-05), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } void testGPU_FusionReductionSchedulerDimShmoo() { - std::vector fp16_usage = {false}; + std::vector fp16_usage = {true, false}; std::vector red_axis = {1, 0}; std::vector output_dims = {320, 640}; std::vector red_dims; @@ -4821,40 +5315,31 @@ void testGPU_FusionReductionSchedulerDimShmoo() { .dtype((fp16 ? at::kHalf : at::kFloat)) .device(at::kCUDA, 0); at::Tensor input = - (axis ? at::rand({odim, rdim}, options) - : at::rand({rdim, odim}, options)); - - const at::ArrayRef inputs({input}); + (axis ? 
at::randn({odim, rdim}, options) + : at::randn({rdim, odim}, options)); - c10::optional rparams = - cuda::scheduleReduction(&fusion, inputs, tv1); - TORCH_CHECK(rparams != c10::nullopt, "Reduction is not found!"); + std::vector outputs_of_red; if (fp16) { - if (axis == 0) { - int tidx = rparams.value().lparams.bdimx(); - tv1_cast->split(-1, tidx); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDx); - tv1_cast->axis(-2)->parallelize(ParallelType::BIDx); - } else { - if (rparams.value().mul_reds_per_blk) { - int tidy = rparams.value().lparams.bdimy(); - tv1_cast->split(0, tidy); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDy); - } - tv1_cast->axis(0)->parallelize(ParallelType::BIDx); - } + outputs_of_red.push_back(tv1_cast); } + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), tv1, outputs_of_red); + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({input}); + auto outputs = + fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({axis}); TORCH_CHECK( - aten_output.allclose(cg_output[0]), + aten_output.allclose(outputs[0], 1e-03, 1e-03), "Error of: ", - aten_output.sub(cg_output[0]).abs().max()); + aten_output.sub(outputs[0]).abs().max()); } } } @@ -5203,6 +5688,7 @@ void testGPU_FusionSmem() { aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); } void testGPU_FusionSmemReduce() { @@ -5245,61 +5731,314 @@ void testGPU_FusionSmemReduce() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = fe.runFusion({t0}); + + at::Tensor aten_output = sum(t0, {1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); +} + +void testGPU_FusionSmemBlockGemm() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + constexpr int BSX = 16; + tv5->split(2, BSX); + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv6 = tv5->rFactor({-1}); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + + tv0->computeAt(tv5, 3); + tv1->computeAt(tv5, 3); + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + 
tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + tv6->axis(-3)->parallelize(ParallelType::TIDy); + tv6->axis(-2)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemBlockGemmCache() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + // Remove reduction axis from tv5 + // tv6 = (M, R, N) + // tv5 = (M, N) + TensorView* tv6 = tv5->cache_before(); + + constexpr int BSX = 16; + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); + // tv5 = M/BSX, N/BSX, MSX, NSX + + tv6->computeAt(tv5, 2); + tv6->computeAt(tv5, 2); + + tv6->split(-1, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv7 = tv6->rFactor({-1}); + // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr + // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + + tv0->computeAt(tv6, 3); + tv1->computeAt(tv6, 3); + + tv0->computeAt(tv7, 3); + tv1->computeAt(tv7, 3); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + tv7->setMemoryType(MemoryType::Shared); + // Memory Type + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + + tv7->axis(-3)->parallelize(ParallelType::TIDy); + tv7->axis(-2)->parallelize(ParallelType::TIDx); + + tv6->axis(-2)->parallelize(ParallelType::TIDy); + tv6->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemDynamicReductionSymbolic() { + Fusion fusion; + 
FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Shared); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +void testGPU_FusionSmemDynamicReductionSymbolicArg() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + Int* sym_bsx = new Int(); + TensorView* tv0 = makeDummyTensor(3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(sym_bsx); + + TensorView* tv1 = sum(tv0, {1}); // M, R, N + fusion.addOutput(tv1); + + TensorView* tv2 = tv0->cache_after(); + tv2->setMemoryType(MemoryType::Shared); + + // Schedule + constexpr int BSX = 32; + tv1->split(2, BSX); + tv1->split(1, sym_bsx); + tv1->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); + TensorView* tv3 = tv1->rFactor({-2}); + + tv0->computeAt(tv1, -2); + tv0->computeAt(tv3, -2); + + // Thread and Block binding + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::BIDy); + tv1->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K, N}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {t0, runtime_threadIdx_dim}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); at::Tensor aten_output = sum(t0, {1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); } -void testGPU_FusionSmemBlockGemm() { +void testGPU_FusionSmemDynamicPwiseMulSymbolicArgWAR() { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = 
makeDummyTensor(2); // (M, K) TensorView* tv1 = makeDummyTensor(2); // (K, N) TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N fusion.addInput(tv0); fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); + fusion.addInput(sym_bsx); + fusion.addOutput(tv4); + // Algorithm tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); + constexpr int BSX = 32; + tv4->split(2, BSX); + tv4->split(1, sym_bsx); + tv4->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); + tv0->computeAt(tv4, 3); + tv1->computeAt(tv4, 3); + // Schedule + + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::BIDy); // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); + // Thread and Block binding - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 128, K = 457, N = 1024; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K}, options); @@ -5307,100 +6046,231 @@ void testGPU_FusionSmemBlockGemm() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = fe.runFusion( + {t0, t1, BSX}, + torch::jit::fuser::cuda::LaunchParams(-1, -1, -1, BSX, -1, -1)); - at::Tensor aten_output = matmul(t0, t1); + at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(22) == 1); } -void testGPU_FusionSmemBlockGemmCache() { -#if 0 +void testGPU_FusionSmemDynamicTiledGemm() { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm - TensorView* tv0 = makeDummyTensor(2); // (M, K) - TensorView* tv1 = makeDummyTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N + // Symbolic integers we will use for runtime tiling + Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z + Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x + Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x + // Compile-time 
integer for tiling + int n_smem_tile = 8; // bound to threadIdx.y + + // Symbolic 2D tensors TV0[M, K], TV1[K, N] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + // Broadcast tv0 to [M, K, *] + TensorView* tv2 = broadcast(tv0, {false, false, true}); + // Broadcast tv1 to [*, K, N] + TensorView* tv3 = broadcast(tv1, {true, false, false}); + + // Pointwise multiplication resulting in tv3[M, K, N] + TensorView* tv4 = mul(tv2, tv3); + + // Turn the K-dimension of tv4 into a reduction dimension + TensorView* tv5 = sum(tv4, {1}); + + // Register inputs and outputs fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv5); - // Schedule - // Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); + // Register runtime tile dims as inputs + fusion.addInput(symbolic_m_tile_dim); + fusion.addInput(symbolic_split_k_tile_dim); + fusion.addInput(symbolic_block_k_tile_dim); - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX + // Make a 3D tile, mix of symbolic and constant, do in reverse order because + // dims are inserted + tv5->split(2, n_smem_tile); + tv5->split(1, symbolic_block_k_tile_dim); + tv5->split(1, symbolic_split_k_tile_dim); + tv5->split(0, symbolic_m_tile_dim); - tv6->computeAt(tv5, 2); + // Reorder so all outer tiles are in the leftmost 3 positions + tv5->reorder({{1, 5}, {5, 1}}); + + // Factor out the outer reduction IterDomain, then run the inter-cta + // reduction, and intra-cta reduction + auto tv6 = tv5->rFactor({2}); + + // Scope computations tv6->computeAt(tv5, 2); - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + // RFactor moves reduction axes around, reorder to match ordering of tv5 + tv6->reorder({ + {2, -2}, + {3, -1}, + {4, 2}, + {5, 3}, + {6, 4}, + }); + // Setup compute at schedule tv0->computeAt(tv6, 3); tv1->computeAt(tv6, 3); + tv4->computeAt(tv6, -1); + // + // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) + // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) + // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] + // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] + // T5[ Mo, No, rKoi, rKii, Mi, Ni] - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - + // Cache smem tiles tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type + tv4->setMemoryType(MemoryType::Local); + tv6->setMemoryType(MemoryType::Local); - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); + std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; + for (auto tv : tv_list) { + 
tv->axis(-2)->parallelize(ParallelType::TIDz); + tv->axis(-1)->parallelize(ParallelType::TIDy); + } + tv2->axis(3)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::TIDx); + tv4->axis(3)->parallelize(ParallelType::TIDx); + tv6->axis(3)->parallelize(ParallelType::TIDx); + tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(4)->parallelize(ParallelType::BIDx); + tv3->axis(4)->parallelize(ParallelType::BIDx); + tv4->axis(4)->parallelize(ParallelType::BIDx); + tv6->axis(4)->parallelize(ParallelType::BIDx); + tv5->axis(3)->parallelize(ParallelType::BIDx); - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 31, K = 65, N = 33; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); + at::Tensor A = at::randn({M, K}, options); + at::Tensor B = at::randn({K, N}, options); torch::jit::fuser::cuda::FusionExecutor fe; + // Generate CUDA and compile with nvRTC fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); - at::Tensor aten_output = matmul(t0, t1); + // Runtime tiling + int m_tile = 4; // bound to threadIdx.z + int split_k = 7; // bound to blockIdx.x + int intra_cta = 8; // bound to threadIdx.x + + auto fuser_outputs = fe.runFusion({A, B, m_tile, split_k, intra_cta}); + auto C_fuser = fuser_outputs[0]; + + at::Tensor aten_C = mul(A.unsqueeze(2), B.unsqueeze(0)).sum(1); + TORCH_CHECK( + aten_C.allclose(C_fuser, 1e-5, 1e-5), + "Error of: ", + aten_C.sub(C_fuser).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(41) == 1); +} + +void testGPU_FusionGlobalIntermediate() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. 
+ tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Global); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); -#endif +} + +void testGPU_FusionGlobalIntermediateDefaultSchedule() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(2); + TensorView* tv3 = makeDummyTensor(2); + TensorView* tv4 = sub(tv2, tv3); + TensorView* tv5 = add(tv1, tv4); + TensorView* tv6 = sub(tv5, tv0); + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + fusion.addInput(tv3); + fusion.addOutput(tv6); + // t6 = ((t1 + (t2 - t3)) - t0) + + tv4->setMemoryType(MemoryType::Global); + tv5->setMemoryType(MemoryType::Global); + tv6->setMemoryType(MemoryType::Global); + + constexpr int M = 32, N = 810; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor in0 = at::rand({M, N}, options); + at::Tensor in1 = at::rand({M, N}, options); + at::Tensor in2 = at::rand({M, N}, options); + at::Tensor in3 = at::rand({M, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({in0, in1, in2, in3}); + + at::Tensor aten_output = (in1 + (in2 - in3)) - in0; + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().sum()); } void testGPU_FusionConstCheck() { @@ -5990,6 +6860,195 @@ void testGPU_FusionThreadPredicate() { TORCH_CHECK(aten_output_tv3.allclose(cg_output_tv3)); } +void testGPU_FusionLSTMCell() { + const int hidden_features = 512; + const int batch_size = 64; + + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tvs[16]; + for (size_t i = 0; i < 16; i++) { + tvs[i] = makeDummyTensor(2); + fusion.addInput(tvs[i]); + } + + auto ingate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); + + auto forgetgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); + + auto cellgate = unaryOp( + UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); + + auto outgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); + + auto cx = makeContigTensor(2); + fusion.addInput(cx); + + auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); + + auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); + + fusion.addOutput(cy); + fusion.addOutput(hy); + + std::vector inputs; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor large_tensor0 = + at::randn({batch_size, 
hidden_features * 4}, options); + at::Tensor large_tensor1 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor2 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor3 = + at::randn({batch_size, hidden_features * 4}, options); + + auto chunked0 = large_tensor0.chunk(4, 1); + auto chunked1 = large_tensor1.chunk(4, 1); + auto chunked2 = large_tensor2.chunk(4, 1); + auto chunked3 = large_tensor3.chunk(4, 1); + + inputs.insert(inputs.end(), chunked0.begin(), chunked0.end()); + inputs.insert(inputs.end(), chunked1.begin(), chunked1.end()); + inputs.insert(inputs.end(), chunked2.begin(), chunked2.end()); + inputs.insert(inputs.end(), chunked3.begin(), chunked3.end()); + + auto at_ingate = + chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); + auto at_forgetgate = + chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); + auto at_cellgate = + chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); + auto at_outgate = + chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); + + auto at_cx = at::randn({batch_size, hidden_features}, options); + inputs.push_back(at_cx); + auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); + auto at_hy = at_outgate.mul(at_cy.tanh()); + + fuser::cuda::scheduleFusion(&fusion, c10::ArrayRef(inputs)); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion(c10::ArrayRef(inputs)); + + TORCH_CHECK(at_cy.allclose(outputs[0], 1e-4, 1e-7)); + TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); +} + +void testGPU_FusionComputeAtMultiBCast() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = broadcast(tv1, {true, false}); + TensorView* tv3 = broadcast(tv1, {false, true}); + TensorView* tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // This is not supported and should throw an exception. 
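For reference, tv4 above is the outer-style sum of the scaled input: tv1 broadcast along a new leading axis plus tv1 broadcast along a new trailing axis. A minimal ATen sketch of the same math (hypothetical 1-D size; the unsupported computeAt call is asserted to throw just below):

// Sketch only (hypothetical size): what tv4 computes in ATen terms.
#include <ATen/ATen.h>

void multiBCastReferenceSketch() {
  at::Tensor x = at::randn({16}, at::kFloat);
  at::Tensor y = x * 0.5;                            // tv1
  at::Tensor ref = y.unsqueeze(0) + y.unsqueeze(1);  // broadcast({true,false}) + broadcast({false,true})
  // Same result as scaling after the broadcasted add.
  TORCH_CHECK(ref.allclose((x.unsqueeze(0) + x.unsqueeze(1)).mul(0.5)));
  TORCH_CHECK(ref.size(0) == 16 && ref.size(1) == 16);
}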
+ ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); +} + +void testGPU_FusionReductionHalf() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3, DataType::Half); + fusion.addInput(tv0); + + auto tv1 = castOp(DataType::Float, tv0); + auto tv2 = add(tv1, new Float(1.0)); + auto tv3 = sum(tv2, {2}); + auto tv4 = castOp(DataType::Half, tv3); + + fusion.addOutput(tv4); + + const auto options = + at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + at::Tensor input = at::randn({8, 8, 16}, options); + + auto reduction_tv = tv3; + + auto outputsOfReduction = DependencyCheck::getAllOutputsOf({reduction_tv}); + + // Grab only tensor views, though there shouldn't be any other type + auto tv_entries = ir_utils::filterByType(outputsOfReduction); + + std::vector tvOutputsOfReduction( + tv_entries.begin(), tv_entries.end()); + + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, reduction_tv); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), reduction_tv, tvOutputsOfReduction); + + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + + cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + // no broadcasting needed, omitting the last optional argument; + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); + + auto aten_output = input.to(c10::ScalarType::Float) + .add(1.0) + .sum({2}) + .to(c10::ScalarType::Half); + + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-04, 1e-04), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +void testGPU_FusionInputsIdLookup() { + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({16, 8, 8}, options); + at::Tensor t1 = at::randn({8, 8}, options); + at::Tensor t2 = at::randn({6, 4}, options); + + // create a cache with max size 2; + auto inputs_id_lookup = torch::jit::fuser::cuda::InputsIdLookup(2); + + // testing basic function, same encoding for identical inputs + auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0}); + auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5}); + TORCH_CHECK(id_0.id == id_0_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 1); + TORCH_CHECK(id_0.eviction == false); + + // new input (even tho same shape, but we have different signature because of + // missing scalar input + auto id_1 = inputs_id_lookup.lookupId({t0, t1}); + auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1.id == id_1_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_1.eviction == false); + + // eviction should happen at this point + auto id_2 = inputs_id_lookup.lookupId({t2, t1}); + TORCH_CHECK(id_2.id != id_0.id); + TORCH_CHECK(id_2.id != id_1.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_2.eviction == true); + TORCH_CHECK(id_2.evict_id == id_0.id); + + // look at input 1 again + auto id_1_relook = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1_relook.id == id_1.id); + TORCH_CHECK(id_1_relook.eviction == false); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 0285559fb8fc..a058326c2050 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -135,7 +135,16 @@ namespace jit { _(GPU_FusionCompoundOps) \ _(GPU_FusionCastOps) \ _(GPU_FusionAdvancedComputeAt) \ + _(GPU_FusionComputeAtMultiConsumers) \ + 
_(GPU_FusionComputeAtCommonConsumer1) \ + _(GPU_FusionComputeAtCommonConsumer2) \ + _(GPU_FusionComputeAtCommonConsumer3) \ + _(GPU_FusionComputeAtNoCommonConsumer) \ _(GPU_FusionScalarInputs) \ + _(GPU_FusionBCastConcretizeBasic) \ + _(GPU_FusionBCastConcretizeRfactor) \ + _(GPU_FusionProveIdEqBasic) \ + _(GPU_FusionProveIdEqRfactor) \ _(GPU_FusionRFactorReplay) \ _(GPU_FusionReduction) \ _(GPU_FusionReduction2) \ @@ -183,6 +192,12 @@ namespace jit { _(GPU_FusionSmemReduce) \ _(GPU_FusionSmemBlockGemm) \ _(GPU_FusionSmemBlockGemmCache) \ + _(GPU_FusionSmemDynamicReductionSymbolic) \ + _(GPU_FusionSmemDynamicReductionSymbolicArg) \ + _(GPU_FusionSmemDynamicPwiseMulSymbolicArgWAR) \ + _(GPU_FusionSmemDynamicTiledGemm) \ + _(GPU_FusionGlobalIntermediate) \ + _(GPU_FusionGlobalIntermediateDefaultSchedule) \ _(GPU_FusionConstCheck) \ _(GPU_FusionSymbolicReduction) \ _(GPU_FusionUnrollWithAlloc) \ @@ -197,7 +212,11 @@ namespace jit { _(GPU_FusionTraversalOrder6) \ _(GPU_FusionTraversalOrder7) \ _(GPU_FusionBranches) \ - _(GPU_FusionThreadPredicate) + _(GPU_FusionThreadPredicate) \ + _(GPU_FusionLSTMCell) \ + _(GPU_FusionComputeAtMultiBCast) \ + _(GPU_FusionReductionHalf) \ + _(GPU_FusionInputsIdLookup) #else #define TH_FORALL_TESTS_CUDA(_) \ _(GraphExecutor) \ diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 9d61cd5dd157..0c8a1f9a967d 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -550,9 +550,8 @@ def t(x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) # end-2-end test of permutation & contiguity handling in integration. @@ -595,11 +594,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(oo, jit_oo)` - self.assertTrue(self._compare("comparing output failed", oo, jit_oo, 1e-4)) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. 
+ # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -630,6 +628,81 @@ def test_reduction_permutation(self): for perm1 in itertools.permutations(range(len(x))): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): + torch._C._jit_set_bailout_depth(2) + + def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): + o = torch.mul(x, y) + o = torch.mul(o, scale) + out1 = torch.mul(o, z) + out2 = torch.sum(out1, dim=[2]) + return out1, out2 + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + scale = 0.5 + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + x = x.to(memory_format=torch.channels_last) + y = y.to(memory_format=torch.channels_last) + z = z.to(memory_format=torch.channels_last) + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_dtype(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2], dtype=torch.float32) + return o + t_jit = torch.jit.script(t) + + x = torch.randn(8, 4, 16, dtype=torch.float, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_half(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2]) + return o + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 16, dtype=torch.float16, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") @@ -651,9 +724,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo 
in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -676,9 +748,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 4b9959c1231e..41e16df7d686 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=legacy") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index e2869eca7b5f..7559b85519c4 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=profiling") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 174bb858da44..26ab975373a8 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -339,6 +339,7 @@ libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -348,6 +349,7 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", + "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", @@ -357,8 +359,10 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/kernel.cpp", "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", + "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 000000000000..f6e791f0edba --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,640 @@ + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +namespace { + 
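// Overview of the generator defined below: CudaKernelGenerator walks the
// Kernel IR and prints the CUDA source for a single kernel.
// generateKernelDefinition() drives it in order:
//   1. genDeclaration() : "__global__ void <name>(...)" listing kernel inputs,
//      outputs, global work buffers and, for stochastic kernels, the extra
//      (seed, offset) arguments.
//   2. genPrologue()    : Philox RNG setup and the extern __shared__ "array"
//      buffer when dynamic shared memory or block/grid reductions are used.
//   3. genBody()        : dispatches handle() over the kernel's top-level
//      expressions; the handle() overloads further down print scalars, tensor
//      indices, arithmetic ops, loops, ifs, allocations and syncs.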
+class CudaKernelGenerator : private OptInConstDispatch { + static constexpr char* kTab = " "; + + public: + static std::string generateKernelDefinition( + const Kernel* kernel, + const std::string& kernel_name) { + CudaKernelGenerator codegen(kernel); + codegen.genDeclaration(kernel_name); + codegen.startBlock(); + codegen.genPrologue(); + codegen.genBody(); + codegen.endBlock(); + TORCH_CHECK(codegen.block_nest_level_ == 0); + return codegen.code_.str(); + } + + private: + explicit CudaKernelGenerator(const Kernel* kernel) : kernel_(kernel) {} + + // Generates the kernel function declaration + void genDeclaration(const std::string& kernel_name) { + const auto& kernel_summary = kernel_->summary(); + + code_ << "__global__ void " << kernel_name << "("; + + std::vector params; + + // Inputs + for (auto val : kernel_->inputs()) { + params.push_back(val); + } + + // Outputs + for (auto val : kernel_->outputs()) { + params.push_back(val); + } + + // Global buffers + for (auto allocate : kernel_summary.global_allocations) { + params.push_back(allocate->buffer()); + } + + // Generate parameter declarations + for (Val* val : params) { + switch (val->getValType().value()) { + case ValType::KirTensorView: { + // TODO(kir): review this + const auto tv = val->as(); + code_ << "Tensor<" << val->getDataType().value() << ", " + << TensorDomain::noReductions( + tv->fuserTv()->getMaybeRFactorDomain()) + .size() + << "> " << gen(tv); + break; + } + case ValType::KirScalar: + code_ << val->getDataType().value() << " " << gen(val); + break; + default: + TORCH_CHECK(!"Unexpected parameter type"); + } + + if (val != params.back()) { + code_ << ", "; + } + } + + // Kernels generating random numbers take extra (seed, offset) arguments + if (kernel_summary.is_stochastic) { + code_ << ", unsigned long long seed, unsigned long long offset"; + } + + code_ << ") "; + } + + // Generates setup code which is executed before the kernel body + void genPrologue() { + const auto& kernel_summary = kernel_->summary(); + + // Random number generator (optional) + if (kernel_summary.is_stochastic) { + indent() << "const int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"; + indent() << "Philox rnd(seed, idx, offset);\n"; + } + + // Do we have any dynamic shared memory buffers? + const bool has_dynamic_smem = + !kernel_summary.dynamic_smem_allocations.empty(); + + // Do we have any reductions? 
+ const bool has_reductions = kernel_summary.has_block_reductions || + kernel_summary.has_grid_reductions; + + // Shared memory + if (has_dynamic_smem || has_reductions) { + indent() << "alignas(" + << dataTypeSize(kernel_summary.largest_smem_data_type) + << ") extern __shared__ char array[];\n"; + + if (has_dynamic_smem) { + indent() << "unsigned offset = 0;\n"; + } + + if (has_reductions) { + indent() << "void* shared_mem = array;\n"; + if (has_dynamic_smem) { + indent() << "offset += " + << "((blockDim.x * blockDim.y * blockDim.z) * sizeof(" + << kernel_summary.largest_smem_data_type << "));\n"; + } + } + } + } + + void genBody() { + for (auto expr : kernel_->topLevelExprs()) { + OptInConstDispatch::handle(expr); + } + } + + void startBlock(bool continuation = false) { + if (continuation) { + code_ << "{\n"; + } else { + indent() << "{\n"; + } + ++block_nest_level_; + } + + void endBlock(const char* sep = "\n") { + --block_nest_level_; + TORCH_CHECK(block_nest_level_ >= 0); + indent() << "}" << sep; + } + + std::ostream& indent() { + for (int i = 0; i < block_nest_level_; ++i) { + code_ << kTab; + } + return code_; + } + + std::string gen(const Statement* stmt) { + std::stringstream tmp_code; + std::swap(tmp_code, code_); + handle(stmt); + std::swap(tmp_code, code_); + return tmp_code.str(); + } + + std::string gen(const kir::TensorView* tv) { + std::stringstream tv_name; + tv_name << "T" << tv->name(); + return tv_name.str(); + } + + std::string genInline(const Statement* stmt) { + const bool saved_inline = print_inline_; + print_inline_ = true; + const auto result = gen(stmt); + print_inline_ = saved_inline; + return result; + } + + void handle(const Statement* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Expr* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Val* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const kir::Bool* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "b" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::Float* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "f" << node->name(); + } else { + const int digits = std::numeric_limits::max_digits10; + code_ << "float(" << std::setprecision(digits) << *node->value() << ")"; + } + } + + void handle(const kir::Half* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "h" << node->name(); + } else { + code_ << "__float2half(" << *node->value() << ")"; + } + } + + void handle(const kir::Int* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "i" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::NamedScalar* node) final { + code_ << node->name(); + } + + void handle(const kir::TensorIndex* node) final { + code_ << gen(node->view()) << "["; + + bool first = true; + for (auto* ind : node->indices()) { + if (!ind->isZeroInt()) { + if (!first) { + code_ << " + "; + } + code_ << genInline(ind); + first = false; + } + } + + if (first) { + code_ << "0"; + } + + code_ << "]"; + } + + 
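  // Example of the TensorIndex printing above (names are illustrative only):
  // indices {i3, 0, i5} on a tensor view T2 are emitted as "T2[i3 + i5]";
  // zero indices are skipped and an all-zero index list collapses to "T2[0]".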
void handle(const kir::IterDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorView* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::UnaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar() && !node->in()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + if (auto op = inline_op_str(node->getUnaryOpType())) { + code_ << *op << gen(node->in()); + } else { + if (node->getUnaryOpType() == UnaryOpType::Cast) { + const auto cast_str = + cast_func_str({node->in()->getDataType().value(), + node->out()->getDataType().value()}); + code_ << cast_str.value(); + } else { + code_ << node->getUnaryOpType(); + } + + code_ << "("; + if (node->getUnaryOpType() == UnaryOpType::RandLike) { + code_ << "rnd"; + } else { + code_ << gen(node->in()); + } + code_ << ")"; + } + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genBinaryOp( + BinaryOpType op_type, + const std::string& lhs, + const std::string& rhs) { + std::stringstream expr; + if (auto op = inline_op_str(op_type)) { + expr << lhs << " " << *op << " " << rhs; + } else { + expr << op_type << "(" << lhs << ", " << rhs << ")"; + } + return expr.str(); + } + + void handle(const kir::BinaryOp* node) final { + const auto op_type = node->getBinaryOpType(); + if (print_inline_) { + // Inline expression: `lhs op rhs` + code_ << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + indent() << gen(node->out()); + if (node->out()->isScalar()) { + // Single line: `out = lhs op rhs;` + code_ << " = " + << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + // Split TensorView expressions across multiple lines: + // + // out + // = lhs + // op rhs; + // + if (auto op = inline_op_str(op_type)) { + code_ << "\n"; + indent() << kTab << "= " << gen(node->lhs()) << "\n"; + indent() << kTab << *op << " " << gen(node->rhs()); + } else { + code_ << " = " << op_type << "(\n"; + indent() << kTab << gen(node->lhs()) << ",\n"; + indent() << kTab << gen(node->rhs()) << ")"; + } + } + code_ << ";\n"; + } + } + + void handle(const kir::TernaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + code_ << node->getTernaryOpType() << "(" << gen(node->in1()) << ", " + << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { + std::stringstream lambda; + lambda << "[](" << data_type << " &a, " << data_type << " b) " + << "{ a = " << genBinaryOp(op_type, "a", "b") << "; }"; + return lambda.str(); + } + + void handle(const kir::BroadcastOp* node) final { + const ir_utils::ParallelTypeBitmap domains = + ir_utils::getParallelBroadcastDomains( + node->out(), kernel_->predicateMap()); + + const bool thread_x = domains.get(ParallelType::TIDx); + const bool thread_y = domains.get(ParallelType::TIDy); + const bool thread_z = domains.get(ParallelType::TIDz); + const bool block_x = domains.get(ParallelType::BIDx); + const bool block_y = domains.get(ParallelType::BIDy); + const bool block_z = domains.get(ParallelType::BIDz); + + const bool grid_broadcast_needed = block_x || block_y || block_z; + const bool block_broadcast_needed = 
thread_x || thread_y || thread_z; + + TORCH_INTERNAL_ASSERT( + !grid_broadcast_needed, + "Parallel broadcast across blocks not supported"); + + if (block_broadcast_needed) { + const auto data_type = node->out()->getDataType().value(); + indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") + << ", " << (thread_y ? "true" : "false") << ", " + << (thread_z ? "true" : "false") << ">(\n"; + indent() << kTab << gen(node->out()) << ",\n"; + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem));\n"; + } else { + indent() << gen(node->out()) << "\n"; + indent() << kTab << " = " << gen(node->in()) << ";\n"; + } + } + + void handle(const kir::ReductionOp* node) final { + TORCH_CHECK(node->out()->getValType() == ValType::TensorIndex); + + const auto out = node->out()->as(); + const auto domain = out->view()->domain(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + if (!has_block_reduce && !has_grid_reduce) { + const auto gen_out = gen(out); + const auto op_type = node->getReductionOpType(); + indent() << gen_out << " = " + << genBinaryOp(op_type, gen_out, gen(node->in())) << ";\n"; + return; + } + + const auto par_domains = node->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + + const auto data_type = node->out()->getDataType().value(); + const auto op_type = node->getReductionOpType(); + + if (has_block_reduce) { + if (has_grid_reduce) { + indent() << data_type << " " + << "block_result" + << ";\n"; + } + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") + << ">(\n"; + if (has_grid_reduce) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(node->out()) << ",\n"; + } + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->init()) << ");\n"; + } + } + + void handle(const kir::GridReduction* node) final { + const auto rop = node->reduction_op(); + TORCH_INTERNAL_ASSERT(rop->out()->getValType() == ValType::TensorIndex); + + const auto out = rop->out()->as(); + const auto domain = out->view()->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + const auto par_domains = rop->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + const bool bidx = par_domains.find(ParallelType::BIDx) != par_domains.end(); + const bool bidy = par_domains.find(ParallelType::BIDy) != par_domains.end(); + const bool bidz = par_domains.find(ParallelType::BIDz) != par_domains.end(); + + const auto data_type = rop->out()->getDataType().value(); + const auto op_type = rop->getReductionOpType(); + + TORCH_INTERNAL_ASSERT( + node->reduction_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + node->sync_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + const auto work_buffer = + node->reduction_buffer()->buffer()->as(); + const auto sync_buffer = + node->sync_buffer()->buffer()->as(); + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid reduction. + indent() << kir::GridReduction::getPredicateFlagName(out->view()) << " = " + << "reduction::gridReduce<" << (bidx ? "true" : "false") << ", " + << (bidy ? "true" : "false") << ", " << (bidz ? "true" : "false") + << ", " << (!tidx ? "true" : "false") << ", " + << (!tidy ? "true" : "false") << ", " << (!tidz ? 
"true" : "false") + << ">(\n"; + indent() << kTab << gen(rop->out()) << ",\n"; + if (domain->hasBlockReduction()) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(rop->in()) << ",\n"; + } + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "&" << gen(work_buffer) << "[0],\n"; + indent() << kTab << gen(sync_buffer) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->reduction_op()->init()) << ");\n"; + } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Woverloaded-virtual" + // TODO(Kir): fix me + void handle(const kir::Scope& scope) { + for (auto expr : scope.exprs()) { + handle(expr); + } + } +#pragma clang diagnostic pop + + void handle(const kir::ForLoop* node) final { + // TODO(kir): handle this during lowering + if (node->iter_domain()->isThread() || node->iter_domain()->isBroadcast()) { + handle(node->body()); + return; + } + + const auto gen_index = gen(node->index()); + const auto gen_start = genInline(node->iter_domain()->start()); + const auto gen_extent = genInline(node->iter_domain()->extent()); + indent() << "for(size_t " << gen_index << " = " << gen_start << "; " + << gen_index << " < " << gen_extent << "; ++" << gen_index << ") "; + + startBlock(true); + handle(node->body()); + endBlock(); + } + + void handle(const kir::IfThenElse* node) final { + indent() << "if (" << genInline(node->cond()) << ") "; + + // "then" block + startBlock(true); + handle(node->thenBody()); + + // "else" block (optional) + if (node->hasElse()) { + endBlock(" else "); + startBlock(true); + handle(node->elseBody()); + } + + endBlock(); + } + + // TODO(kir): fold initialization into Allocate + void handle(const kir::Allocate* node) final { + if (node->buffer()->getValType().value() != ValType::KirTensorView) { + indent() << node->buffer_type() << " " << gen(node->buffer()) << ";\n"; + return; + } + + const auto tv = node->buffer()->as(); + TORCH_INTERNAL_ASSERT(tv->domain()->nDims() > 0); + TORCH_INTERNAL_ASSERT(node->size() != nullptr); + + switch (tv->memoryType()) { + case MemoryType::Global: + indent() << "// Allocate global tensor " << gen(tv) << "\n"; + break; + case MemoryType::Shared: + if (node->size()->isConstScalar()) { + // Static shared memory + indent() << "__shared__ " << node->buffer_type() << " " << gen(tv) + << "[" << genInline(node->size()) << "];\n"; + } else { + // Align Offset Position + indent() << "offset = alignBufferSize(offset," + << dataTypeSize(node->buffer_type()) << ");\n"; + // Shared Memory Pointer + indent() << node->buffer_type() << "* " << gen(tv) + << " = reinterpret_cast<" << node->buffer_type() << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(node->size()) << " * sizeof(" + << node->buffer_type() << "));\n"; + } + break; + case MemoryType::Local: + indent() << node->buffer_type() << " " << gen(tv) << "[" + << genInline(node->size()) << "];\n"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); + } + } + + void handle(const kir::Sync* node) final { + indent() << "__syncthreads();\n"; + } + + private: + std::stringstream code_; + const Kernel* kernel_; + int block_nest_level_ = 0; + + // TODO(kir): replace with explicit assignment statements + bool print_inline_ = false; +}; + +} // 
namespace + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + FUSER_PERF_SCOPE("generateCudaKernel"); + return CudaKernelGenerator::generateKernelDefinition(kernel, kernel_name); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 000000000000..562aa1554eb2 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +//! Generates a CUDA kernel definition for the given kernel +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 3e0f5303b966..9f8f7aba1cf4 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -20,11 +21,10 @@ ComputeAtData::ComputeAtData(TensorView* tv) void ComputeAtData::clearPass() { // If the last pass set a position, update the new_compute_at_position if // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - new_compute_at_position = - pass_pos > new_compute_at_position ? pass_pos : new_compute_at_position; + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + new_compute_at_position = current_traversal_position; + } current_traversal_position_set = false; current_traversal_position = 0; @@ -52,16 +52,19 @@ void ComputeAtData::setPassPosition(unsigned int pos) { } unsigned int ComputeAtData::getNewPosition() const { - // If the last pass set a position, update the new_compute_at_position if - // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - return pass_pos > new_compute_at_position ? pass_pos - : new_compute_at_position; + // If the last pass set a position, return the latest position if + // it would be greater than previously set. + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + return current_traversal_position; + } else { + return new_compute_at_position; + } } void ComputeAtData::validateNewComputeAt() const { + FUSER_PERF_SCOPE("validateNewComputeAt"); + TORCH_INTERNAL_ASSERT( getNewPosition() >= original_compute_at_position, "Invalid computeAt detected. 
This computeAt would invalidate the set computeAt on ", @@ -82,7 +85,22 @@ void ComputeAtData::validateNewComputeAt() const { "."); } +void ComputeAtData::setComputeAtDomain(TensorDomain* td) { + if (new_compute_at_domain_ != original_domain_) { + TORCH_INTERNAL_ASSERT( + *new_compute_at_domain_ == *td, + "TensorDomain, ", + td, + ", does not match with the previously set domain of ", + tv_ref_, + ", which is ", + new_compute_at_domain_); + } + new_compute_at_domain_ = td; +} + namespace { + // Wrapper around set_intersection template std::set set_intersection(const std::set& set1, const std::set& set2) { @@ -121,12 +139,15 @@ std::deque> tvChains( } return tv_chains; } + } // namespace void ComputeAt::run( TensorView* producer, TensorView* consumer, unsigned int consumer_position) { + FUSER_PERF_SCOPE("ComputeAt::run"); + // Make sure the correct fusion is setup between this and consumer. TORCH_CHECK( producer->fusion() == consumer->fusion(), @@ -160,6 +181,9 @@ void ComputeAt::run( // Check all dependency chains, select the next TV after producer towards // consumer. These are the TVs we're going to actually call computeAt on. for (const auto& tv_chain : all_chains) { + // When a chain only has two tensors, they must be the producer, + // which is an input, and the consumer. There is nothing we need + // to do for such chains. if (tv_chain.size() > 2) { // Make sure we only add once, but we want to add in a determinsitic // order @@ -188,6 +212,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int consumer_compute_at_axis) { + FUSER_PERF_SCOPE("backwardComputeAt_impl"); + auto& producer_entry = tv_data.at(producer); // Use TensorDomain interface so it doesn't set computeAt automatically @@ -209,6 +235,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int producer_compute_at_axis) { + FUSER_PERF_SCOPE("forwardComputeAt_impl"); + auto& consumer_entry = tv_data.at(consumer); const auto& producer_entry = tv_data.at(producer); @@ -229,6 +257,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( } void ComputeAt::setCommonConsumer() { + FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer"); + // Convert the first chain to a set. std::set common_consumers( producer_use_chains_.front().begin(), producer_use_chains_.front().end()); @@ -281,6 +311,8 @@ void ComputeAt::setCommonConsumer() { // Similar to backward traversal in traverseAllKnown but we should only apply // computeAt if it will increase computeAt positions. void ComputeAt::traverseBackward() { + FUSER_PERF_SCOPE("ComputeAt::traverseBackward"); + // propagate *backward* through all *producer* use_chains or from *producer* // to common_consumer if common_consumer exists. Only apply transform if // increases computeAt position. @@ -307,6 +339,8 @@ void ComputeAt::traverseBackward() { } void ComputeAt::traverseForward() { + FUSER_PERF_SCOPE("ComputeAt::traverseForward"); + // propagate forward through all *producer* use_chains or from *producer* to // common_consumer if common_consumer exists. 
auto chains = producer_use_chains_; @@ -338,6 +372,8 @@ void ComputeAt::traverseForward() { } void ComputeAt::runPass() { + FUSER_PERF_SCOPE("ComputeAt::runPass"); + // Initialize tv_data for all TensorViews we may modify auto chains = producer_use_chains_; if (common_consumer_ != nullptr) { @@ -382,6 +418,8 @@ void ComputeAt::runPass() { } void ComputeAt::setupOutputs() { + FUSER_PERF_SCOPE("ComputeAt::setupOutputs"); + if (common_consumer_ != nullptr) return; @@ -421,9 +459,6 @@ ComputeAt::ComputeAt( : producer_(_producer), consumer_(_consumer), consumer_position_(_consumer_position) { - if (consumer_position_ < 0) - consumer_position_ += consumer_->nDims(); - TORCH_INTERNAL_ASSERT( consumer_position_ >= 0 && consumer_position_ <= consumer_->nDims(), "Invalid computeAt axis, received ", diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 84677ae99448..a9112a6225ca 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -56,9 +56,7 @@ class ComputeAtData { // If we set computeAt, save the domain so we can reset it after traversal. // Traversal state can deviate from the domain we will want to save after the // entire computeAt pass. - void setComputeAtDomain(TensorDomain* td) { - new_compute_at_domain_ = td; - } + void setComputeAtDomain(TensorDomain* td); // Return domain set in setComputeAtDomain TensorDomain* getComputeAtDomain() const { diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/torch/csrc/jit/codegen/cuda/docs/.gitignore new file mode 100644 index 000000000000..1936cc1d441e --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/.gitignore @@ -0,0 +1 @@ +html diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/torch/csrc/jit/codegen/cuda/docs/documentation.h new file mode 100644 index 000000000000..cfd4435461b9 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/documentation.h @@ -0,0 +1,23 @@ + +#error This is used exclusively for generating the documentation (not a real header) + +//! \namespace torch::jit::fuser +//! \brief Main PyTorch JIT Fuser namespace + +//! \namespace torch::jit::fuser::cuda +//! \brief CUDA specific components + +//! \namespace torch::jit::fuser::cuda::executor_utils +//! \brief Fuser executor related utilities + +//! \namespace torch::jit::fuser::kir +//! \brief Kernel IR + +//! \namespace torch::jit::fuser::ir_utils +//! \brief IR manipulation utilities + +//! \namespace torch::jit::fuser::loop_utils +//! \brief Loop utilities + +//! \namespace torch::jit::fuser::scope_utils +//! \brief Scope utilities diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen new file mode 100644 index 000000000000..b9a51b187aa5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen @@ -0,0 +1,2515 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. + +PROJECT_NAME = "PyTorch JIT Fuser" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. 
+ +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. + +# TODO: switch to NO once key concepts are documented +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. 
+# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = NO + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of the
+# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT += .. +INPUT += documentation.h +INPUT += main_page.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE += + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS += Ui +EXCLUDE_SYMBOLS += internal +EXCLUDE_SYMBOLS += __* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. 
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH = images
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = main_page.md
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. 
+# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = --std=c++1z + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: 0. + +CLANG_COMPILATION_DATABASE_PATH = 0 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. 
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. 
See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically <CTRL>, <ALT>/